pax_global_header00006660000000000000000000000064145535212770014525gustar00rootroot0000000000000052 comment=f30fb2bbbd90aa9aaed1d79b9aaad8979cbde47c pdf2docx-0.5.8/000077500000000000000000000000001455352127700132505ustar00rootroot00000000000000pdf2docx-0.5.8/.github/000077500000000000000000000000001455352127700146105ustar00rootroot00000000000000pdf2docx-0.5.8/.github/workflows/000077500000000000000000000000001455352127700166455ustar00rootroot00000000000000pdf2docx-0.5.8/.github/workflows/publish.yml000066400000000000000000000066621455352127700210500ustar00rootroot00000000000000# Create release and publish to Pypi when pushing tags name: publish # Trigger the workflow on push tags, matching vM.n.p, i.e. v3.2.10 on: push: tags: - 'v[0-9]+.[0-9]+.[0-9]+' # Jobs to do: # - build package # - create release with asset # - publish to Pypi jobs: # ----------------------------------------------------------- # create python env and setup package # ----------------------------------------------------------- build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: - name: Check out code uses: actions/checkout@v2 - name: Set up Python 3.x uses: actions/setup-python@v1 with: python-version: '3.10' - name: Display Python version run: python -c "import sys; print(sys.version)" - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel pytest # build package for tags, e.g. 
3.2.1 extracted from 'refs/tags/v3.2.1' - name: Build package run: | echo ${GITHUB_REF#refs/tags/v} > version.txt python setup.py sdist --formats=gztar,zip python setup.py bdist_wheel # test wheel file locally - name: Test package run: | pip install ./dist/*.whl cd test pytest -v test.py::TestConversion # upload the artifacts for further jobs # - app-tag.tar.gz # - app-tag.zip # - app-tag-info.whl - name: Archive package uses: actions/upload-artifact@v2 with: name: dist path: ./dist # ----------------------------------------------------------- # create release and upload asset # ----------------------------------------------------------- release: runs-on: ubuntu-latest # Run this job after build completes successfully needs: build steps: - name: Checkout code uses: actions/checkout@v2 # download artifacts from job: build - name: Download artifacts from build uses: actions/download-artifact@v2 with: name: dist path: dist # create release and upload assets - name: Create release with assets uses: softprops/action-gh-release@v1 with: files: | ./dist/*.tar.gz ./dist/*.zip ./dist/*.whl env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # ----------------------------------------------------------- # publish to Pypi # ----------------------------------------------------------- publish: runs-on: ubuntu-latest # Run this job after both build and release completes successfully needs: [build, release] steps: - name: Checkout code uses: actions/checkout@v2 - name: Download artifacts from release uses: actions/download-artifact@v2 with: name: dist path: dist # Error when two sdist files created in build job are uploaded to Pipi: # HTTPError: 400 Bad Request from https://upload.pypi.org/legacy/ # Only one sdist may be uploaded per release. 
- name: Remove duplicated sdist run: rm ./dist/*.zip - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@master with: password: ${{ secrets.PYPI_PASSWORD }}pdf2docx-0.5.8/.github/workflows/test.yml000066400000000000000000000046311455352127700203530ustar00rootroot00000000000000# Run test when triggering the workflow on push and pull request, # but only for the master branch name: test on: push: branches: - master pull_request: branches: - master # ----------------------------------------------------------------------------------------------------- # To leverage the benefit of Github Action, the testing process is divided into three jobs: # 1. pdf2docx: convert sample pdf to docx -> linux runner # 2. docx2pdf: convert generated docx to pdf for comparing -> specific runner with MS Word installed # 3. check_quality: convert page to image and compare similarity with python-opencv -> linux runner # However, keep step 1 only, considering the difficulty to get a specific runner with MS Word installed. 
# ----------------------------------------------------------------------------------------------------- jobs: pdf2docx-docker: runs-on: ubuntu-latest container: image: python:3.8 steps: - name: Check out code uses: actions/checkout@v2 - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install pytest python setup.py develop - name: Run unit test run: | pytest -v ./test/test.py::TestConversion pdf2docx-ubuntu: runs-on: ubuntu-latest needs: pdf2docx-docker strategy: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - name: Check out code uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install pytest pytest-cov python setup.py develop - name: Run unit test run: | pytest -v ./test/test.py::TestConversion --cov=./pdf2docx --cov-report=xml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 with: # Or as an environment variable token: ${{ secrets.CODECOV_TOKEN }} # upload docx for further job - name: Archive package uses: actions/upload-artifact@v2 with: name: outputs path: ./test/outputspdf2docx-0.5.8/.gitignore000066400000000000000000000003141455352127700152360ustar00rootroot00000000000000# files *.pyc *.jp*g *.docx layout.json .vscode/ # pdf testing files *.pdf !demo*.pdf *coverage* test/issues/ test/features/ test/outputs/ diff.png # building dir build/ dist/ *egg-info/ pdf2docx*.rstpdf2docx-0.5.8/.readthedocs.yaml000066400000000000000000000013361455352127700165020ustar00rootroot00000000000000# .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-20.04 tools: python: "3.9" # You can also specify 
other tool versions: # nodejs: "16" # rust: "1.55" # golang: "1.17" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: doc/conf.py # If using Sphinx, optionally build your docs in additional formats such as PDF #formats: # - pdf # Optionally declare the Python requirements required to build your docs python: install: - requirements: doc/requirements.txt pdf2docx-0.5.8/AFFERO GPL000066400000000000000000001033031455352127700145400ustar00rootroot00000000000000GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. 
A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>.pdf2docx-0.5.8/LICENSE000066400000000000000000001045141455352127700142620ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. 
We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. 
The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. 
Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 
The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. 
Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. 
You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. 
Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 
If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. 
If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. 
If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. 
For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. 
But first, please read .pdf2docx-0.5.8/MANIFEST.in000066400000000000000000000001561455352127700150100ustar00rootroot00000000000000include *.md include LICENSE* include requirements.txt prune test include test/*.py include test/samples/*.pdfpdf2docx-0.5.8/Makefile000066400000000000000000000017531455352127700147160ustar00rootroot00000000000000# Project makefile # working directories and files # TOPDIR :=$(shell pwd) SRC :=$(TOPDIR)/pdf2docx BUILD :=$(TOPDIR)/build DOCSRC :=$(TOPDIR)/doc TEST :=$(TOPDIR)/test CLEANDIRS :=.pytest_cache pdf2docx.egg-info dist # pip install sphinx_rtd_theme .PHONY: src doc test clean src: @python setup.py sdist --formats=gztar,zip && \ python setup.py bdist_wheel doc: @if [ -f "$(DOCSRC)/Makefile" ] ; then \ ( cd "$(DOCSRC)" && make html MODULEDIR="$(SRC)" BUILDDIR="$(BUILD)" ) || exit 1 ; \ fi test: @if [ -f "$(TEST)/Makefile" ] ; then \ ( cd "$(TEST)" && make test SOURCEDIR="$(SRC)" ) || exit 1 ; \ fi clean: @if [ -e "$(DOCSRC)/Makefile" ] ; then \ ( cd "$(DOCSRC)" && make $@ BUILDDIR="$(BUILD)" ) || exit 0 ; \ fi @for p in $(CLEANDIRS) ; do \ if [ -d "$(TOPDIR)/$$p" ]; then rm -rf "$(TOPDIR)/$$p" ; fi ; \ done @if [ -d "$(BUILD)" ]; then rm -rf "$(BUILD)" ; fi @if [ -e "$(TEST)/Makefile" ] ; then \ ( cd "$(TEST)" && make $@ ) || exit 0 ; \ fipdf2docx-0.5.8/README.md000066400000000000000000000051711455352127700145330ustar00rootroot00000000000000English | [中文](README_CN.md) # pdf2docx ![python-version](https://img.shields.io/badge/python->=3.6-green.svg) [![codecov](https://codecov.io/gh/dothinking/pdf2docx/branch/master/graph/badge.svg)](https://codecov.io/gh/dothinking/pdf2docx) [![pypi-version](https://img.shields.io/pypi/v/pdf2docx.svg)](https://pypi.python.org/pypi/pdf2docx/) ![license](https://img.shields.io/pypi/l/pdf2docx.svg) ![pypi-downloads](https://img.shields.io/pypi/dm/pdf2docx) - Extract data from PDF with `PyMuPDF`, e.g. text, images and drawings - Parse layout with rule, e.g. 
sections, paragraphs, images and tables - Generate docx with `python-docx` ## Features - Parse and re-create page layout - page margin - section and column (1 or 2 columns only) - page header and footer [TODO] - Parse and re-create paragraph - OCR text [TODO] - text in horizontal/vertical direction: from left to right, from bottom to top - font style, e.g. font name, size, weight, italic and color - text format, e.g. highlight, underline, strike-through - list style [TODO] - external hyper link - paragraph horizontal alignment (left/right/center/justify) and vertical spacing - Parse and re-create image - in-line image - image in Gray/RGB/CMYK mode - transparent image - floating image, i.e. picture behind text - Parse and re-create table - border style, e.g. width, color - shading style, i.e. background color - merged cells - vertical direction cell - table with partly hidden borders - nested tables - Parsing pages with multi-processing *It can also be used as a tool to extract table contents since both table content and format/style is parsed.* ## Limitations - Text-based PDF file - Left to right language - Normal reading direction, no word transformation / rotation - Rule-based method can't 100% convert the PDF layout ## Documentation - [Installation](https://pdf2docx.readthedocs.io/en/latest/installation.html) - [Quickstart](https://pdf2docx.readthedocs.io/en/latest/quickstart.html) - [Convert PDF](https://pdf2docx.readthedocs.io/en/latest/quickstart.convert.html) - [Extract table](https://pdf2docx.readthedocs.io/en/latest/quickstart.table.html) - [Command Line Interface](https://pdf2docx.readthedocs.io/en/latest/quickstart.cli.html) - [Graphic User Interface](https://pdf2docx.readthedocs.io/en/latest/quickstart.gui.html) - [Technical Documentation (In Chinese)](https://pdf2docx.readthedocs.io/en/latest/techdoc.html) - [API Documentation](https://pdf2docx.readthedocs.io/en/latest/modules.html) ## Sample 
![sample_compare.png](https://s1.ax1x.com/2020/08/04/aDryx1.png)pdf2docx-0.5.8/README_CN.md000066400000000000000000000050401455352127700151060ustar00rootroot00000000000000[English](README.md) | 中文 # pdf2docx ![python-version](https://img.shields.io/badge/python->=3.6-green.svg) [![codecov](https://codecov.io/gh/dothinking/pdf2docx/branch/master/graph/badge.svg)](https://codecov.io/gh/dothinking/pdf2docx) [![pypi-version](https://img.shields.io/pypi/v/pdf2docx.svg)](https://pypi.python.org/pypi/pdf2docx/) ![license](https://img.shields.io/pypi/l/pdf2docx.svg) ![pypi-downloads](https://img.shields.io/pypi/dm/pdf2docx) - 基于 `PyMuPDF` 提取文本、图片、矢量等原始数据 - 基于规则解析章节、段落、表格、图片、文本等布局及样式 - 基于 `python-docx` 创建Word文档 ## 主要功能 - 解析和创建页面布局 - 页边距 - 章节和分栏 (目前最多支持两栏布局) - 页眉和页脚 [TODO] - 解析和创建段落 - OCR 文本 [TODO] - 水平(从左到右)或竖直(自底向上)方向文本 - 字体样式例如字体、字号、粗/斜体、颜色 - 文本样式例如高亮、下划线和删除线 - 列表样式 [TODO] - 外部超链接 - 段落水平对齐方式 (左/右/居中/分散对齐)及前后间距 - 解析和创建图片 - 内联图片 - 灰度/RGB/CMYK等颜色空间图片 - 带有透明通道图片 - 浮动图片(衬于文字下方) - 解析和创建表格 - 边框样式例如宽度和颜色 - 单元格背景色 - 合并单元格 - 单元格垂直文本 - 隐藏部分边框线的表格 - 嵌套表格 - 支持多进程转换 *`pdf2docx`同时解析出了表格内容和样式,因此也可以作为一个表格内容提取工具。* ## 限制 - 目前暂不支持扫描PDF文字识别 - 仅支持从左向右书写的语言(因此不支持阿拉伯语) - 不支持旋转的文字 - 基于规则的解析无法保证100%还原PDF样式 ## 使用帮助 - [安装](https://pdf2docx.readthedocs.io/en/latest/installation.html) - [快速上手](https://pdf2docx.readthedocs.io/en/latest/quickstart.html) - [转换PDF](https://pdf2docx.readthedocs.io/en/latest/quickstart.convert.html) - [提取表格](https://pdf2docx.readthedocs.io/en/latest/quickstart.table.html) - [命令行参数](https://pdf2docx.readthedocs.io/en/latest/quickstart.cli.html) - [简单图形界面](https://pdf2docx.readthedocs.io/en/latest/quickstart.gui.html) - [技术手册](https://pdf2docx.readthedocs.io/en/latest/techdoc.html) - [API手册](https://pdf2docx.readthedocs.io/en/latest/modules.html) ## 样例 
![sample_compare.png](https://s1.ax1x.com/2020/08/04/aDryx1.png)pdf2docx-0.5.8/doc/000077500000000000000000000000001455352127700140155ustar00rootroot00000000000000pdf2docx-0.5.8/doc/Makefile000066400000000000000000000007151455352127700154600ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # MODULEDIR and BUILDDIR are set in top makefile SOURCEDIR = . TARGETDIR = doctrees html .PHONY: html clean html: Makefile @sphinx-apidoc --separate -o "$(SOURCEDIR)" "$(MODULEDIR)" && \ sphinx-build -M html "$(SOURCEDIR)" "$(BUILDDIR)" clean: @for p in $(TARGETDIR) ; do \ if [ -d "$(BUILDDIR)/$$p" ]; then rm -rf "$(BUILDDIR)/$$p" ; fi ; \ done @if [ -e modules.rst ]; then rm pdf2docx*.rst ; fipdf2docx-0.5.8/doc/api/000077500000000000000000000000001455352127700145665ustar00rootroot00000000000000pdf2docx-0.5.8/doc/api/modules.rst000066400000000000000000000000751455352127700167720ustar00rootroot00000000000000pdf2docx ======== .. toctree:: :maxdepth: 4 pdf2docx pdf2docx-0.5.8/doc/conf.py000066400000000000000000000103311455352127700153120ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.abspath("../pdf2docx/")) # -- Project information ----------------------------------------------------- project = 'pdf2docx' copyright = '2023, Artifex' author = 'Artifex Software, Inc.' 
# The full version, including alpha/beta/rc tags
# read version number from version.txt, otherwise alpha version
# Github CI can create version.txt dynamically.
def get_version(fname):
    """Return the version string stored on the first line of ``fname``.

    Args:
        fname (str): path of the version file, e.g. the ``version.txt``
            that the CI workflow may generate before building.

    Returns:
        str: the stripped first line of the file, or ``'alpha'`` when the
        file does not exist or its first line is blank.
    """
    # Open directly instead of testing os.path.exists() first: this avoids the
    # check-then-open race and still treats a missing path as "no version".
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            version = f.readline().strip()
    except (FileNotFoundError, NotADirectoryError):
        return 'alpha'

    # An empty or whitespace-only file must not produce an empty release string.
    return version or 'alpha'


release = get_version('../version.txt')


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinxcontrib.apidoc'
]
apidoc_module_dir = '../pdf2docx'
apidoc_output_dir = 'api'
apidoc_excluded_paths = []
apidoc_separate_modules = True

# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [ ]


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']


# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
    # "fontpkg": r"\usepackage[sfdefault]{ClearSans} \usepackage[T1]{fontenc}"
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [("index", "pdf2docx.tex", "pdf2docx Documentation", "Artifex", "manual")] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = "images/pymupdf-logo.png" # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # If true, show page references after internal links. latex_show_pagerefs = False # If true, show URL addresses after external links. # latex_show_urls = True # latex_use_xindy = True # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. latex_domain_indices = True # -- Options for PDF output -------------------------------------------------- # Grouping the document tree into PDF files. List of tuples # (source start file, target name, title, author). pdf_documents = [("index", "pdf2docx", "pdf2docx manual", "Artifex")] # A comma-separated list of custom stylesheets. Example: # pdf_stylesheets = ["sphinx", "bahnschrift", "a4"] # Create a compressed PDF pdf_compressed = True # A colon-separated list of folders to search for fonts. Example: # pdf_font_path=['/usr/share/fonts', '/usr/share/texmf-dist/fonts/'] # Language to be used for hyphenation support pdf_language = "en_US" # If false, no index is generated. pdf_use_index = True # If false, no modindex is generated. pdf_use_modindex = True # If false, no coverpage is generated. pdf_use_coverpage = True pdf_break_level = 2 pdf_verbosity = 0 pdf_invariant = True pdf2docx-0.5.8/doc/index.rst000066400000000000000000000011071455352127700156550ustar00rootroot00000000000000Welcome to pdf2docx's documentation! ==================================== `pdf2docx `_ is a Python library to extract data from PDF with ``PyMuPDF``, parse layout with rule, and generate docx file with ``python-docx``. .. image:: https://s1.ax1x.com/2020/08/04/aDryx1.png .. 
toctree:: :maxdepth: 2 :caption: USER GUIDE installation quickstart techdoc .. toctree:: :maxdepth: 2 :caption: API DOCUMENTATION api/modules Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pdf2docx-0.5.8/doc/installation.rst000066400000000000000000000016641455352127700172570ustar00rootroot00000000000000Installation ==================== ``pdf2docx`` can be installed from either PyPI or the source code. Install from PyPI ------------------- Type the command below for a new installation:: $ pip install pdf2docx Or, upgrade this library with:: $ pip install --upgrade pdf2docx Install from source code remotely -------------------------------------- Install ``pdf2docx`` directly from the ``master`` branch:: $ pip install git+https://github.com/dothinking/pdf2docx.git@master --upgrade .. note:: A version installed this way may be newer than the latest release on PyPI, since ``master`` can contain changes that are not released yet. Install from source code locally --------------------------------------- Clone or download `pdf2docx <https://github.com/dothinking/pdf2docx>`_, navigate to the root directory and run:: $ python setup.py install Or, install it in development mode:: $ python setup.py develop Uninstall -------------- :: $ pip uninstall pdf2docxpdf2docx-0.5.8/doc/quickstart.cli.rst000066400000000000000000000032651455352127700175150ustar00rootroot00000000000000Command Line Interface =========================== :: $ pdf2docx --help NAME pdf2docx - Command line interface for pdf2docx. SYNOPSIS pdf2docx COMMAND | - DESCRIPTION Command line interface for pdf2docx. COMMANDS COMMAND is one of the following: convert Convert pdf file to docx file. debug Convert one PDF page and plot layout information for debugging. table Extract table content from pdf pages. By range of pages ----------------------- Specify the page range with ``--start`` (from the first page if omitted) and ``--end`` (to the last page if omitted). .. note:: The page index is zero-based by default, but this can be turned off with ``--zero_based_index=False``, i.e.
the first page index starts from 1. Convert all pages:: $ pdf2docx convert test.pdf test.docx Convert pages from the second to the end:: $ pdf2docx convert test.pdf test.docx --start=1 Convert pages from the first to the third (index=2):: $ pdf2docx convert test.pdf test.docx --end=3 Convert the second and third pages:: $ pdf2docx convert test.pdf test.docx --start=1 --end=3 Convert the first and second pages with zero-based index turned off:: $ pdf2docx convert test.pdf test.docx --start=1 --end=3 --zero_based_index=False By page numbers ----------------------- Convert the first, third and fifth pages:: $ pdf2docx convert test.pdf test.docx --pages=0,2,4 Multi-Processing -------------------------- Turn on multi-processing with the default CPU count:: $ pdf2docx convert test.pdf test.docx --multi_processing=True Specify the number of CPUs:: $ pdf2docx convert test.pdf test.docx --multi_processing=True --cpu_count=4pdf2docx-0.5.8/doc/quickstart.convert.rst000066400000000000000000000040711455352127700204220ustar00rootroot00000000000000Convert PDF ======================= We can use either the :py:class:`~pdf2docx.converter.Converter` class, or a wrapped method :py:meth:`~pdf2docx.main.parse` to convert all/specified pdf pages to docx. Multi-processing is supported for PDF files with a large number of pages.
Example 1: convert all pages ---------------------------------- :: from pdf2docx import Converter pdf_file = '/path/to/sample.pdf' docx_file = 'path/to/sample.docx' # convert pdf to docx cv = Converter(pdf_file) cv.convert(docx_file) # all pages by default cv.close() An alternative using ``parse`` method:: from pdf2docx import parse pdf_file = '/path/to/sample.pdf' docx_file = 'path/to/sample.docx' # convert pdf to docx parse(pdf_file, docx_file) Example 2: convert specified pages ---------------------------------------- * Specify the page range with ``start`` (from the first page if omitted) and ``end`` (to the last page if omitted):: # convert from the second page to the end (by default) cv.convert(docx_file, start=1) # convert from the first page (by default) to the third (end=3, excluded) cv.convert(docx_file, end=3) # convert the second and third pages cv.convert(docx_file, start=1, end=3) * Alternatively, set separate pages by ``pages``:: # convert the first, third and fifth pages cv.convert(docx_file, pages=[0,2,4]) .. note:: Refer to :py:meth:`~pdf2docx.converter.Converter.convert` for a detailed description of the input arguments. Example 3: multi-Processing -------------------------------- Turn on multi-processing with the default CPU count:: cv.convert(docx_file, multi_processing=True) Specify the number of CPUs:: cv.convert(docx_file, multi_processing=True, cpu_count=4) .. note:: Multi-processing works only for a continuous page range specified by ``start`` and ``end``. Example 4: convert encrypted pdf --------------------------------------- Provide ``password`` to open and convert a password-protected PDF:: cv = Converter(pdf_file, password) cv.convert(docx_file) cv.close() pdf2docx-0.5.8/doc/quickstart.gui.rst000066400000000000000000000003221455352127700175210ustar00rootroot00000000000000Graphic User Interface =========================== Thanks to @JoHnTsIm for providing a ``tkinter``-based user interface. To launch the GUI:: $ pdf2docx gui .. 
image:: https://z3.ax1x.com/2021/05/30/2ZYiUs.pngpdf2docx-0.5.8/doc/quickstart.rst000066400000000000000000000003541455352127700167430ustar00rootroot00000000000000Quickstart ============= ``pdf2docx`` can be used as either a Python library or a CLI tool. In addition, it has a simple GUI. .. toctree:: :maxdepth: 1 quickstart.convert quickstart.table quickstart.cli quickstart.guipdf2docx-0.5.8/doc/quickstart.table.rst000066400000000000000000000013511455352127700200270ustar00rootroot00000000000000Extract table ====================== :: from pdf2docx import Converter pdf_file = '/path/to/sample.pdf' cv = Converter(pdf_file) tables = cv.extract_tables(start=0, end=1) cv.close() for table in tables: print(table) The output may look like:: ... [['Input ', None, None, None, None, None], ['Description A ', 'mm ', '30.34 ', '35.30 ', '19.30 ', '80.21 '], ['Description B ', '1.00 ', '5.95 ', '6.16 ', '16.48 ', '48.81 '], ['Description C ', '1.00 ', '0.98 ', '0.94 ', '1.03 ', '0.32 '], ['Description D ', 'kg ', '0.84 ', '0.53 ', '0.52 ', '0.33 '], ['Description E ', '1.00 ', '0.15 ', None, None, None], ['Description F ', '1.00 ', '0.86 ', '0.37 ', '0.78 ', '0.01 ']]pdf2docx-0.5.8/doc/requirements.txt000066400000000000000000000001511455352127700172760ustar00rootroot00000000000000pdf2docx rst2pdf # define sphinx versioning sphinx==5.3.0 autodoc sphinx_rtd_theme sphinxcontrib.apidoc pdf2docx-0.5.8/doc/techdoc.rst000066400000000000000000000026241455352127700161640ustar00rootroot00000000000000Technical Documentation =========================== PDF文件遵循一定的格式规范,`PyMuPDF `_ 提供了便利的解析函数, 用于获取页面元素例如文本和形状及其位置。然后,基于元素间的相对位置关系解析内容,例如将“横纵线条 围绕着文本”解析为“表格”,将“文本下方的一条横线”解析为“文本下划线”。最后,借助 `python-docx `_ 将解析结果重建为docx格式的Word文档。 以下分篇介绍提取PDF页面数据、解析和重建docx过程中的具体细节: - 提取文本图片和形状_ - 解析页面布局_ - 解析表格_ - 解析段落_ .. _提取文本图片和形状: https://dothinking.github.io/2020-07-14-pdf2docx%E5%BC%80%E5%8F%91%E6%A6%82%E8%A6%81%EF%BC%9A%E6%8F%90%E5%8F%96%E6%96%87%E6%9C%AC%E3%80%81%E5%9B%BE%E7%89%87%E5%92%8C%E5%BD%A2%E7%8A%B6/ .. 
_解析页面布局: https://dothinking.github.io/2021-05-30-pdf2docx%E5%BC%80%E5%8F%91%E6%A6%82%E8%A6%81%EF%BC%9A%E8%A7%A3%E6%9E%90%E9%A1%B5%E9%9D%A2%E5%B8%83%E5%B1%80/ .. _解析表格: https://dothinking.github.io/2020-08-15-pdf2docx%E5%BC%80%E5%8F%91%E6%A6%82%E8%A6%81%EF%BC%9A%E8%A7%A3%E6%9E%90%E8%A1%A8%E6%A0%BC/ .. _解析段落: https://dothinking.github.io/2020-08-27-pdf2docx%E5%BC%80%E5%8F%91%E6%A6%82%E8%A6%81%EF%BC%9A%E8%A7%A3%E6%9E%90%E6%AE%B5%E8%90%BD/pdf2docx-0.5.8/pdf2docx/000077500000000000000000000000001455352127700147615ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/__init__.py000066400000000000000000000001241455352127700170670ustar00rootroot00000000000000from .converter import Converter from .page.Page import Page from .main import parsepdf2docx-0.5.8/pdf2docx/common/000077500000000000000000000000001455352127700162515ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/common/Block.py000066400000000000000000000110201455352127700176470ustar00rootroot00000000000000# -*- coding: utf-8 -*- '''Base class for text/image/table blocks. ''' from .share import BlockType, TextAlignment from .Element import Element class Block(Element): '''Base class for text/image/table blocks. Attributes: raw (dict): initialize object from raw properties. parent (optional): parent object that this block belongs to. 
''' def __init__(self, raw:dict=None, parent=None): self._type = BlockType.UNDEFINED # horizontal spacing if raw is None: raw = {} self.alignment = self._get_alignment(raw.get('alignment', 0)) self.left_space = raw.get('left_space', 0.0) self.right_space = raw.get('right_space', 0.0) self.first_line_space = raw.get('first_line_space', 0.0) # RELATIVE position of tab stops self.tab_stops = raw.get('tab_stops', []) # vertical spacing self.before_space = raw.get('before_space', 0.0) self.after_space = raw.get('after_space', 0.0) self.line_space = raw.get('line_space', 0.0) self.line_space_type = raw.get('line_space_type', 1) # 0-exactly, 1-relatively super().__init__(raw, parent) @property def is_text_block(self): '''Whether test block.''' return self._type==BlockType.TEXT @property def is_inline_image_block(self): '''Whether inline image block.''' return self._type==BlockType.IMAGE @property def is_float_image_block(self): '''Whether float image block.''' return self._type==BlockType.FLOAT_IMAGE @property def is_image_block(self): '''Whether inline or float image block.''' return self.is_inline_image_block or self.is_float_image_block @property def is_text_image_block(self): '''Whether text block or inline image block.''' return self.is_text_block or self.is_inline_image_block @property def is_lattice_table_block(self): '''Whether lattice table (explicit table borders) block.''' return self._type==BlockType.LATTICE_TABLE @property def is_stream_table_block(self): '''Whether stream table (implied by table content) block.''' return self._type==BlockType.STREAM_TABLE @property def is_table_block(self): '''Whether table (lattice or stream) block.''' return self.is_lattice_table_block or self.is_stream_table_block def set_text_block(self): '''Set block type.''' self._type = BlockType.TEXT def set_inline_image_block(self): '''Set block type.''' self._type = BlockType.IMAGE def set_float_image_block(self): '''Set block type.''' self._type = BlockType.FLOAT_IMAGE def 
set_lattice_table_block(self): '''Set block type.''' self._type = BlockType.LATTICE_TABLE def set_stream_table_block(self): '''Set block type.''' self._type = BlockType.STREAM_TABLE def _get_alignment(self, mode:int): for t in TextAlignment: if t.value==mode: return t return TextAlignment.LEFT def parse_horizontal_spacing(self, bbox, *args): """Set left alignment, and calculate left space. Override by :obj:`pdf2docx.text.TextBlock`. Args: bbox (fitz.rect): boundary box of this block. """ # NOTE: in PyMuPDF CS, horizontal text direction is same with positive x-axis, # while vertical text is on the contrary, so use f = -1 here idx, f = (0, 1.0) if self.is_horizontal_text else (3, -1.0) self.alignment = TextAlignment.LEFT self.left_space = (self.bbox[idx] - bbox[idx]) * f def store(self): '''Store attributes in json format.''' res = super().store() res.update({ 'type' : self._type.value, 'alignment' : self.alignment.value, 'left_space' : self.left_space, 'right_space' : self.right_space, 'first_line_space' : self.first_line_space, 'before_space' : self.before_space, 'after_space' : self.after_space, 'line_space' : self.line_space, 'line_space_type' : self.line_space_type, 'tab_stops' : self.tab_stops }) return res def make_docx(self, *args, **kwargs): """Create associated docx element. Raises: NotImplementedError """ raise NotImplementedErrorpdf2docx-0.5.8/pdf2docx/common/Collection.py000066400000000000000000000303441455352127700207220ustar00rootroot00000000000000# -*- coding: utf-8 -*- '''A group of instances, e.g. Blocks, Lines, Spans, Shapes. 
''' import fitz from .Element import Element from .share import (IText, TextDirection) from .algorithm import (solve_rects_intersection, graph_bfs) class BaseCollection: '''Base collection representing a list of instances.''' def __init__(self, instances:list=None, parent=None): '''Init collection from a list of instances.''' self._parent = parent self._instances = [] self.extend(instances or []) # Note to exclude empty instance by default def __getitem__(self, idx): try: instances = self._instances[idx] except IndexError: msg = f'Collection index {idx} out of range.' raise IndexError(msg) else: return instances def __iter__(self): return (instance for instance in self._instances) def __len__(self): return len(self._instances) @property def parent(self): return self._parent @property def bbox(self): '''bbox of combined collection.''' rect = fitz.Rect() for instance in self._instances: rect |= instance.bbox return fitz.Rect([round(x,1) for x in rect]) # NOTE: round to avoid digital error def append(self, instance): if not instance: return self._instances.append(instance) def extend(self, instances:list): if not instances: return for instance in instances: self.append(instance) def reset(self, instances:list=None): """Reset instances list. Args: instances (list, optional): reset to target instances. Defaults to None. Returns: BaseCollection: self """ self._instances = [] self.extend(instances or []) return self def store(self): '''Store attributes in json format.''' return [ instance.store() for instance in self._instances ] def restore(self, *args, **kwargs): '''Construct Collection from a list of dict.''' raise NotImplementedError class Collection(BaseCollection, IText): '''Collection of instance focusing on grouping and sorting elements.''' @property def text_direction(self): '''Get text direction. 
All instances must have same text direction.''' res = set(instance.text_direction for instance in self._instances) return list(res)[0] if len(res)==1 else TextDirection.MIX def group(self, fun): """Group instances according to user defined criterion. Args: fun (function): with 2 arguments representing 2 instances (Element) and return bool. Returns: list: a list of grouped ``Collection`` instances. Examples 1:: # group instances intersected with each other fun = lambda a,b: a.bbox & b.bbox Examples 2:: # group instances aligned horizontally fun = lambda a,b: a.horizontally_aligned_with(b) .. note:: It's equal to a GRAPH searching problem, build adjacent list, and then search graph to find all connected components. """ # build adjacent list: # the i-th item is a set of indexes, which connected to the i-th instance. # NOTE: O(n^2) method, but it's acceptable (~0.2s) when n<1000 which is satisfied by page blocks num = len(self._instances) index_groups = [set() for i in range(num)] # type: list[set] for i, instance in enumerate(self._instances): # connections of current instance to all instances after it for j in range(i+1, num): if fun(instance, self._instances[j]): index_groups[i].add(j) index_groups[j].add(i) # search graph -> grouped index of instance groups = graph_bfs(index_groups) groups = [self.__class__([self._instances[i] for i in group]) for group in groups] return groups def group_by_connectivity(self, dx:float, dy:float): """Collect connected instances into same group. Args: dx (float): x-tolerances to define connectivity dy (float): y-tolerances to define connectivity Returns: list: a list of grouped ``Collection`` instances. .. note:: * It's equal to a GRAPH traversing problem, which the critical point in building the adjacent list, especially a large number of vertex (paths). * Checking intersections between paths is actually a Rectangle-Intersection problem, studied already in many literatures. 
""" # build the graph -> adjacent list: # the i-th item is a set of indexes, which connected to the i-th instance num = len(self._instances) index_groups = [set() for _ in range(num)] # type: list[set] # solve rectangle intersection problem i_rect_x, i = [], 0 d_rect = (-dx, -dy, dx, dy) for rect in self._instances: points = [a+b for a,b in zip(rect.bbox, d_rect)] # consider tolerance i_rect_x.append((i, points, points[0])) i_rect_x.append((i+1, points, points[2])) i += 2 i_rect_x.sort(key=lambda item: item[-1]) solve_rects_intersection(i_rect_x, 2*num, index_groups) # search graph -> grouped index of instance groups = graph_bfs(index_groups) groups = [self.__class__([self._instances[i] for i in group]) for group in groups] return groups def group_by_columns(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False): '''Group elements into columns based on the bbox.''' # split in columns fun = lambda a,b: a.vertically_align_with(b, factor=factor, text_direction=text_direction) groups = self.group(fun) # increase in x-direction if sort if sorted: idx = 3 if text_direction and self.is_vertical_text else 0 groups.sort(key=lambda group: group.bbox[idx]) return groups def group_by_rows(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False): '''Group elements into rows based on the bbox.''' # split in rows fun = lambda a,b: a.horizontally_align_with(b, factor=factor, text_direction=text_direction) groups = self.group(fun) # increase in y-direction if sort if sorted: idx = 0 if text_direction and self.is_vertical_text else 1 groups.sort(key=lambda group: group.bbox[idx]) return groups def group_by_physical_rows(self, sorted:bool=False, text_direction:bool=False): '''Group lines into physical rows.''' fun = lambda a,b: a.in_same_row(b) groups = self.group(fun) # increase in y-direction if sort if sorted: idx = 0 if text_direction and self.is_vertical_text else 1 groups.sort(key=lambda group: group.bbox[idx]) return groups def 
sort_in_reading_order(self): '''Sort collection instances in reading order (considering text direction), e.g. for normal reading direction: from top to bottom, from left to right. ''' if self.is_horizontal_text: self._instances.sort(key=lambda e: (e.bbox.y0, e.bbox.x0, e.bbox.x1)) else: self._instances.sort(key=lambda e: (e.bbox.x0, e.bbox.y1, e.bbox.y0)) return self def sort_in_line_order(self): '''Sort collection instances in a physical with text direction considered, e.g. for normal reading direction: from left to right. ''' if not self.is_vertical_text: self._instances.sort(key=lambda e: (e.bbox.x0, e.bbox.y0, e.bbox.x1)) else: self._instances.sort(key=lambda e: (e.bbox.y1, e.bbox.x0, e.bbox.y0)) return self def sort_in_reading_order_plus(self): '''Sort instances in reading order, especially for instances in same row. Taking natural reading direction for example: reading order for rows, from left to right for instances in row. In the following example, A comes before B:: +-----------+ +---------+ | | | A | | B | +---------+ +-----------+ Steps: * Sort elements in reading order, i.e. from top to bottom, from left to right. * Group elements in row. * Sort elements in row: from left to right. ''' instances = [] for row in self.group_by_physical_rows(sorted=True, text_direction=True): row.sort_in_line_order() instances.extend(row) self.reset(instances) class ElementCollection(Collection): '''Collection of ``Element`` instances.''' def _update_bbox(self, e:Element): '''Update parent bbox.''' if not self._parent is None: # Note: `if self._parent` does not work here self._parent.union_bbox(e) def append(self, e:Element): """Append an instance, update parent's bbox accordingly and set the parent of the added instance. Args: e (Element): instance to append. 
""" if not e: return self._instances.append(e) self._update_bbox(e) # set parent if not self._parent is None: e.parent = self._parent def insert(self, nth:int, e:Element): """Insert a Element and update parent's bbox accordingly. Args: nth (int): the position to insert. e (Element): the instance to insert. """ if not e: return self._instances.insert(nth, e) self._update_bbox(e) e.parent = self._parent # set parent def pop(self, nth:int): """Delete the ``nth`` instance. Args: nth (int): the position to remove. Returns: Collection: the removed instance. """ return self._instances.pop(nth) def is_flow_layout(self, line_separate_threshold:float, cell_layout=False): '''Whether contained elements are in flow layout or not.''' # float layout if vertical text but not cell layout, since vertical text # will be simulated with stream table if not cell_layout and self.is_vertical_text: return False # flow layout if single column only if len(self)<=1: return True if len(self.group_by_columns())>1: return False # group in physical row and check distance between lines idx0, idx1 = (0, 2) if self.is_horizontal_text else (3, 1) for row in self.group_by_physical_rows(text_direction=True): for i in range(1, len(row)): dis = abs(row[i].bbox[idx0]-row[i-1].bbox[idx1]) if dis >= line_separate_threshold: return False return True def contained_in_bbox(self, bbox): '''Filter instances contained in target bbox. Args: bbox (fitz.Rect): target boundary box. ''' instances = list(filter( lambda e: bbox.contains(e.bbox), self._instances)) return self.__class__(instances) def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3): """Split instances into two groups: one intersects with ``bbox``, the other not. Args: bbox (fitz.Rect): target rect box. threshold (float): It's intersected when the overlap rate exceeds this threshold. Defaults to 0. Returns: tuple: two group in original class type. 
""" intersections, no_intersections = [], [] for instance in self._instances: # A contains B => A & B = B intersection = instance.bbox & bbox if intersection.is_empty: no_intersections.append(instance) else: factor = round(intersection.get_area()/instance.bbox.get_area(), 2) if factor >= threshold: intersections.append(instance) else: no_intersections.append(instance) return self.__class__(intersections), self.__class__(no_intersections) pdf2docx-0.5.8/pdf2docx/common/Element.py000066400000000000000000000251531455352127700202220ustar00rootroot00000000000000'''Object with a bounding box, e.g. Block, Line, Span. Based on ``PyMuPDF``, the coordinates (e.g. bbox of ``page.get_text('rawdict')``) are generally provided relative to the un-rotated page; while this ``pdf2docx`` library works under real page coordinate system, i.e. with rotation considered. So, any instances created by this Class are always applied a rotation matrix automatically. Therefore, the bbox parameter used to create ``Element`` instance MUST be relative to un-rotated CS. If final coordinates are provided, should update it after creating an empty object:: Element().update_bbox(final_bbox) .. note:: An exception is ``page.get_drawings()``, the coordinates are converted to real page CS already. ''' import copy import fitz from .share import IText from . import constants class Element(IText): '''Boundary box with attribute in fitz.Rect type.''' # all coordinates are related to un-rotated page in PyMuPDF # e.g. Matrix(0.0, 1.0, -1.0, 0.0, 842.0, 0.0) ROTATION_MATRIX = fitz.Matrix(0.0) # rotation angle = 0 degree by default @classmethod def set_rotation_matrix(cls, rotation_matrix): """Set global rotation matrix. 
Args: Rotation_matrix (fitz.Matrix): target matrix """ if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix): cls.ROTATION_MATRIX = rotation_matrix @classmethod def pure_rotation_matrix(cls): '''Pure rotation matrix used for calculating text direction after rotation.''' a,b,c,d,e,f = cls.ROTATION_MATRIX return fitz.Matrix(a,b,c,d,0,0) def __init__(self, raw:dict=None, parent=None): ''' Initialize Element and convert to the real (rotation considered) page CS.''' self.bbox = fitz.Rect() # type: fitz.Rect self._parent = parent # type: Element # NOTE: Any coordinates provided in raw is in original page CS # (without considering page rotation). if 'bbox' in (raw or {}): rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX self.update_bbox(rect) def __bool__(self): '''Real object when bbox is defined.''' # NOTE inconsistent results of fitz.Rect for different version of pymupdf, e.g., # a = fitz.Rect(3,3,2,2) # bool(a) a.get_area() a.is_empty # pymupdf 1.23.5 True 1.0 True # pymupdf 1.23.8 True 0.0 True # bool(fitz.Rect())==False # NOTE: do not use `return not self.bbox.is_empty` here return bool(self.bbox) def __repr__(self): return f'{self.__class__.__name__}({tuple(self.bbox)})' # ------------------------------------------------ # parent element # ------------------------------------------------ @property def parent(self): return self._parent @parent.setter def parent(self, parent): self._parent = parent # ------------------------------------------------ # bbox operations # ------------------------------------------------ def copy(self): '''make a deep copy.''' # NOTE: can't serialize data because parent is an Object, # so set it None in advance. parent, self.parent = self._parent, None obj = copy.deepcopy(self) self._parent = parent # set back parent return obj def get_expand_bbox(self, dt:float): """Get expanded bbox with margin in both x- and y- direction. Args: dt (float): Expanding margin. Returns: fitz.Rect: Expanded bbox. .. 
note:: This method creates a new bbox, rather than changing the bbox of itself. """ return self.bbox + (-dt, -dt, dt, dt) def update_bbox(self, rect): '''Update current bbox to specified ``rect``. Args: rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)``, in real page CS (with rotation considered). ''' self.bbox = fitz.Rect([round(x,1) for x in rect]) return self def union_bbox(self, e): """Update current bbox to the union with specified Element. Args: e (Element): The target to get union Returns: Element: self """ return self.update_bbox(self.bbox | e.bbox) # -------------------------------------------- # location relationship to other Element instance # -------------------------------------------- def contains(self, e:'Element', threshold:float=1.0): """Whether given element is contained in this instance, with margin considered. Args: e (Element): Target element threshold (float, optional): Intersection rate. Defaults to 1.0. The larger, the stricter. Returns: bool: [description] """ S = e.bbox.get_area() if not S: return False # it's not practical to set a general threshold to consider the margin, so two steps: # - set a coarse but acceptable area threshold, # - check the length in main direction strictly # A contains B => A & B = B intersection = self.bbox & e.bbox factor = round(intersection.get_area()/S, 2) if factor= self.bbox.height: return self.bbox.width+constants.MINOR_DIST >= e.bbox.width return self.bbox.height+constants.MINOR_DIST >= e.bbox.height def get_main_bbox(self, e, threshold:float=0.95): """If the intersection with ``e`` exceeds the threshold, return the union of these two elements; else return None. Args: e (Element): Target element. threshold (float, optional): Intersection rate. Defaults to 0.95. Returns: fitz.Rect: Union bbox or None. 
""" bbox_1 = self.bbox bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e) # areas b = bbox_1 & bbox_2 if b.is_empty: return None # no intersection # Note: if bbox_1 and bbox_2 intersects with only an edge, b is not empty but b.get_area()=0 # so give a small value when they're intersected but the area is zero a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area() factor = a/min(a1,a2) if a else 1e-6 return bbox_1 | bbox_2 if factor >= threshold else None def vertically_align_with(self, e, factor:float=0.0, text_direction:bool=True): '''Check whether two Element instances have enough intersection in vertical direction, i.e. perpendicular to reading direction. Args: e (Element): Object to check with factor (float, optional): Threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned. text_direction (bool, optional): Consider text direction or not. True by default. Returns: bool: [description] Examples:: +--------------+ | | +--------------+ L1 +-------------------+ | | +-------------------+ L2 An enough intersection is defined based on the minimum width of two boxes:: L1+L2-L>factor*min(L1,L2) ''' if not e or not bool(self): return False # text direction idx = 1 if text_direction and self.is_vertical_text else 0 L1 = self.bbox[idx+2]-self.bbox[idx] L2 = e.bbox[idx+2]-e.bbox[idx] L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx]) eps = 1e-3 # tolerant return L1+L2-L+eps >= factor*min(L1,L2) def horizontally_align_with(self, e, factor:float=0.0, text_direction:bool=True): '''Check whether two Element instances have enough intersection in horizontal direction, i.e. along the reading direction. Args: e (Element): Element to check with factor (float, optional): threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned. text_direction (bool, optional): consider text direction or not. True by default. 
Examples:: +--------------+ | | L1 +--------------------+ +--------------+ | | L2 +--------------------+ An enough intersection is defined based on the minimum width of two boxes:: L1+L2-L>factor*min(L1,L2) ''' if not e or not bool(self): return False # text direction idx = 0 if text_direction and self.is_vertical_text else 1 L1 = self.bbox[idx+2]-self.bbox[idx] L2 = e.bbox[idx+2]-e.bbox[idx] L = max(self.bbox[idx+2], e.bbox[idx+2]) - min(self.bbox[idx], e.bbox[idx]) eps = 1e-3 # tolerant return L1+L2-L+eps >= factor*min(L1,L2) def in_same_row(self, e): """Check whether in same row/line with specified Element instance. With text direction considered. Taking horizontal text as an example: * yes: the bottom edge of each box is lower than the centerline of the other one; * otherwise, not in same row. Args: e (Element): Target object. .. note:: The difference to method ``horizontally_align_with``: they may not in same line, though aligned horizontally. """ if not e or self.is_horizontal_text != e.is_horizontal_text: return False # normal reading direction by default idx = 1 if self.is_horizontal_text else 0 c1 = (self.bbox[idx] + self.bbox[idx+2]) / 2.0 c2 = (e.bbox[idx] + e.bbox[idx+2]) / 2.0 res = c1<=e.bbox[idx+2] and c2<=self.bbox[idx+2] # Note y direction under PyMuPDF context return res # ------------------------------------------------ # others # ------------------------------------------------ def store(self): '''Store properties in raw dict.''' return { 'bbox': tuple(x for x in self.bbox) } def plot(self, page, stroke:tuple=(0,0,0), width:float=0.5, fill:tuple=None, dashes:str=None): '''Plot bbox in PDF page for debug purpose.''' page.draw_rect(self.bbox, color=stroke, fill=fill, width=width, dashes=dashes, overlay=False, fill_opacity=0.5) 
pdf2docx-0.5.8/pdf2docx/common/__init__.py000066400000000000000000000000001455352127700203500ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/common/algorithm.py000066400000000000000000000357201455352127700206200ustar00rootroot00000000000000from collections import deque import numpy as np import cv2 as cv # ------------------------------------------------------------------------------------------- # Intersection area of two iso-oriented rectangles # ------------------------------------------------------------------------------------------- def get_area(bbox_1:tuple, bbox_2:tuple): x0, y0, x1, y1 = bbox_1 u0, v0, u1, v1 = bbox_2 # width of intersected area w = (x1-x0) + (u1-u0) - (max(x1, u1)-min(x0, u0)) if w<=0: return 0 # height of intersected area h = (y1-y0) + (v1-v0) - (max(y1, v1)-min(y0, v0)) if h<=0: return 0 return w*h # ------------------------------------------------------------------------------------------- # Breadth First Search method for graph # ------------------------------------------------------------------------------------------- def graph_bfs(graph): '''Breadth First Search graph (may be disconnected graph). Args: graph (list): GRAPH represented by adjacent list, [set(1,2,3), set(...), ...] Returns: list: A list of connected components ''' # search graph # NOTE: generally a disconnected graph counted_indexes = set() # type: set[int] groups = [] for i in range(len(graph)): if i in counted_indexes: continue # connected component starts... indexes = set(_graph_bfs_from_node(graph, i)) groups.append(indexes) counted_indexes.update(indexes) return groups def _graph_bfs_from_node(graph, start): '''Breadth First Search connected graph with start node. Args: graph (list): GRAPH represented by adjacent list, [set(1,2,3), set(...), ...]. start (int): Index of any start vertex. 
''' search_queue = deque() searched = set() search_queue.append(start) while search_queue: cur_node = search_queue.popleft() if cur_node in searched: continue yield cur_node searched.add(cur_node) for node in graph[cur_node]: search_queue.append(node) # ------------------------------------------------------------------------------------------- # Implementation of solving Rectangle-Intersection Problem according to algorithm proposed in # paper titled "A Rectangle-Intersection Algorithm with Limited Resource Requirements". # https://ieeexplore.ieee.org/document/5578313 # # - Input # The rectangle is represented by its corner points, (x0, y0, x1, y1) # # - Output # The output is an Adjacent List of each rect, which could be used to initialize a GRAPH. # ------------------------------------------------------------------------------------------- # procedure report(S, n) # 1 Let V be the list of x-coordinates of the 2n vertical edges in S sorted in non-decreasing order. # 2 Let H be the list of n y-intervals corresponding to the bottom and top y-coordinates of each rectangle. # 3 Sort the elements of H in non-decreasing order by their bottom y-coordinates. # 4 Call procedure detect(V, H, 2n). def solve_rects_intersection(V:list, num:int, index_groups:list): '''Implementation of solving Rectangle-Intersection Problem. Performance:: O(nlog n + k) time and O(n) space, where k is the count of intersection pairs. Args: V (list): Rectangle-related x-edges data, [(index, Rect, x), (...), ...]. num (int): Count of V instances, equal to len(V). index_groups (list): Target adjacent list for connectivity between rects. 
Procedure ``detect(V, H, m)``:: if m < 2 then return else - let V1 be the first ⌊m/2⌋ and let V2 be the rest of the vertical edges in V in the sorted order; - let S11 and S22 be the set of rectangles represented only in V1 and V2 but not spanning V2 and V1, respectively; - let S12 be the set of rectangles represented only in V1 and spanning V2; - let S21 be the set of rectangles represented only in V2 and spanning V1 - let H1 and H2 be the list of y-intervals corresponding to the elements of V1 and V2 respectively - stab(S12, S22); stab(S21, S11); stab(S12, S21) - detect(V1, H1, ⌊m/2⌋); detect(V2, H2, m − ⌊m/2⌋) ''' if num < 2: return # start/end points of left/right intervals center_pos = int(num/2.0) X0, X, X1 = V[0][-1], V[center_pos-1][-1], V[-1][-1] # split into two groups left = V[0:center_pos] right = V[center_pos:] # filter rects according to their position to each intervals S11 = list(filter( lambda item: item[1][2]<=X, left )) S12 = list(filter( lambda item: item[1][2]>=X1, left )) S22 = list(filter( lambda item: item[1][0]>X, right )) S21 = list(filter( lambda item: item[1][0]<=X0, right )) # intersection in x-direction is fulfilled, so check y-direction further _stab(S12, S22, index_groups) _stab(S21, S11, index_groups) _stab(S12, S21, index_groups) # recursive process solve_rects_intersection(left, center_pos, index_groups) solve_rects_intersection(right, num-center_pos, index_groups) def _stab(S1:list, S2:list, index_groups:list): '''Check interval intersection in y-direction. 
Procedure ``stab(A, B)``:: i := 1; j := 1 while i ≤ |A| and j ≤ |B| if ai.y0 < bj.y0 then k := j while k ≤ |B| and bk.y0 < ai.y1 reportPair(air, bks) k := k + 1 i := i + 1 else k := i while k ≤ |A| and ak.y0 < bj.y1 reportPair(bjs, akr) k := k + 1 j := j + 1 ''' if not S1 or not S2: return # sort S1.sort(key=lambda item: item[1][1]) S2.sort(key=lambda item: item[1][1]) i, j = 0, 0 while i 1 for c0, c1 in zip(arr_x0, arr_x1): y_arr = arr[r0:r1, c0:c1] top_left = (x0+c0, y0+r0) xy_cut(y_arr, top_left, res, min_w, min_h, min_dx, min_dy) # do xy-cut recursively res = [] xy_cut(arr=img_binary, top_left=(0, 0), res=res, min_w=min_w, min_h=min_h, min_dx=min_dx, min_dy=min_dy) return res def _split_projection_profile(arr_values:np.array, min_value:float, min_gap:float): '''Split projection profile: ``` ┌──┐ arr_values │ │ ┌─┐─── ┌──┐ │ │ │ │ | │ │ │ │ ┌───┐ │ │min_value │ │<- min_gap ->│ │ │ │ │ │ | ────┴──┴─────────────┴──┴─┴───┴─┴─┴─┴─── 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ``` Args: arr_values (np.array): 1-d array representing the projection profile. min_value (float): Ignore the profile if `arr_value` is less than `min_value`. min_gap (float): Ignore the gap if less than this value. Returns: tuple: Start indexes and end indexes of split groups. 
''' # all indexes with projection height exceeding the threshold arr_index = np.where(arr_values>min_value)[0] if not len(arr_index): return # find zero intervals between adjacent projections # | | || # ||||<- zero-interval -> ||||| arr_diff = arr_index[1:] - arr_index[0:-1] arr_diff_index = np.where(arr_diff>min_gap)[0] arr_zero_intvl_start = arr_index[arr_diff_index] arr_zero_intvl_end = arr_index[arr_diff_index+1] # convert to index of projection range: # the start index of zero interval is the end index of projection arr_start = np.insert(arr_zero_intvl_end, 0, arr_index[0]) arr_end = np.append(arr_zero_intvl_start, arr_index[-1]) arr_end += 1 # end index will be excluded as index slice return arr_start, arr_end def inner_contours(img_binary:np.array, bbox:tuple, min_w:float, min_h:float): '''Inner contours of current region, especially level 2 contours of the default opencv tree hirerachy. Args: img_binary (np.array): Binarized image with interesting region (255) and empty region (0). bbox (tuple): The external bbox. min_w (float): Ignore contours if the bbox width is less than this value. min_h (float): Ignore contours if the bbox height is less than this value. Returns: list: A list of bbox-es of inner contours. ''' # find both external and inner contours of current region x0, y0, x1, y1 = bbox arr = np.zeros(img_binary.shape, dtype=np.uint8) arr[y0:y1, x0:x1] = img_binary[y0:y1, x0:x1] contours, hierarchy = cv.findContours(arr, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE) # check first three level contours: # * level-0, i.e. table bbox # * level-1, i.e. cell bbox # * level-2, i.e. region within cell # NOTE: only one dimension, i.e. 
the second, to be decided, so the # return value of np.where is a len==1 tuple level_0 = np.where(hierarchy[0,:,3]==-1)[0] level_1 = np.where(np.isin(hierarchy[0,:,3], level_0))[0] level_2 = np.where(np.isin(hierarchy[0,:,3], level_1))[0] # In general, we focus on only level 2, but considering edge case: level 2 contours # might be counted as level 1 incorrectly, e.g. test/samples/demo-table-close-underline.pdf. # So, get first the concerned level 1 contours, i.e. those contained by other level 1 contour. def contains(bbox1, bbox2): x0, y0, x1, y1 = bbox1 u0, v0, u1, v1 = bbox2 return u0>=x0 and v0>=y0 and u1<=x1 and v1<=y1 level_1_bbox_list, res_level_1, res = [], [], [] for i in level_1: x, y, w, h = cv.boundingRect(contours[i]) if w 1.0.0 from docx.oxml import OxmlElement, parse_xml, register_element_cls except ImportError: # python-docx >= 1.0.0 from docx.oxml.parser import OxmlElement, parse_xml, register_element_cls from docx.oxml.ns import qn, nsdecls from docx.oxml.shape import CT_Picture from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne from docx.enum.text import WD_COLOR_INDEX from docx.image.exceptions import UnrecognizedImageError from docx.table import _Cell from docx.opc.constants import RELATIONSHIP_TYPE from .share import rgb_value # --------------------------------------------------------- # section and paragraph # --------------------------------------------------------- def set_equal_columns(section, num=2, space=0): """Set section column count and space. All the columns have same width. Args: section : ``python-docx`` Section instance. num (int): Column count. Defaults to 2. space (int, optional): Space between adjacent columns. Unit: Pt. Defaults to 0. """ col = section._sectPr.xpath('./w:cols')[0] col.set(qn('w:num'), str(num)) col.set(qn('w:space'), str(int(20*space))) # basic unit 1/20 Pt def set_columns(section, width_list:list, space=0): """Set section column count and space. Args: section : ``python-docx`` Section instance. 
width_list (list|tuple): Width of each column. space (int, optional): Space between adjacent columns. Unit: Pt. Defaults to 0. Scheme:: """ cols = section._sectPr.xpath('./w:cols')[0] # do nothing if only one column if len(width_list)==1: # recovery to default column setting in case previous section has multiple columns if len(cols)!=1: cols.clear() return # set multiple column properties # clear columns in advance because the latest column setting seems to be inherited cols.clear() cols.set(qn('w:num'), str(len(width_list))) cols.set(qn('w:equalWidth'), '0') # insert column with width for w in width_list: e = OxmlElement('w:col') e.set(qn('w:w'), str(int(20*w))) e.set(qn('w:space'), str(int(20*space))) # basic unit 1/20 Pt cols.append(e) def delete_paragraph(paragraph): '''Delete a paragraph. Reference: https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 ''' p = paragraph._element p.getparent().remove(p) paragraph._p = paragraph._element = None def reset_paragraph_format(p, line_spacing:float=1.05): '''Reset paragraph format, especially line spacing. Two kinds of line spacing, corresponding to the setting in MS Office Word: * line_spacing=1.05: single or multiple * line_spacing=Pt(1): exactly Args: p (Paragraph): ``python-docx`` paragraph instance. line_spacing (float, optional): Line spacing. Defaults to 1.05. Returns: paragraph_format: Paragraph format. ''' pf = p.paragraph_format pf.line_spacing = line_spacing # single by default pf.space_before = Pt(0) pf.space_after = Pt(0) pf.left_indent = Pt(0) pf.right_indent = Pt(0) pf.widow_control = True # do not adjust spacing between Chinese and Latin/number xml = r''.format(nsdecls('w')) p._p.get_or_add_pPr().insert(0, parse_xml(xml)) xml = r''.format(nsdecls('w')) p._p.get_or_add_pPr().insert(0, parse_xml(xml)) return pf def set_hidden_property(p): '''Hide paragraph. This method just sets the paragraph property, while the added text must be hided explicitly. 
r = p.add_run() r.text = "Hidden" r.font.hidden = True Args: p (Paragraph): python-docx created paragraph. ''' pPr = OxmlElement('w:pPr') # paragraph property rPr = OxmlElement('w:rPr') # run property v = OxmlElement('w:vanish') # hidden rPr.append(v) pPr.append(rPr) p._p.append(rPr) # --------------------------------------------------------- # text properties # --------------------------------------------------------- def set_char_scaling(p_run, scale:float=1.0): '''Set character spacing: scaling. Manual operation in MS Word: Font | Advanced | Character Spacing | Scaling. Args: p_run (docx.text.run.Run): Proxy object wrapping element. scale (float, optional): scaling factor. Defaults to 1.0. ''' p_run._r.get_or_add_rPr().insert(0, parse_xml(r''.format(nsdecls('w'), 100*scale))) def set_char_spacing(p_run, space:float=0.0): '''Set character spacing. Manual operation in MS Word: Font | Advanced | Character Spacing | Spacing. Args: p_run (docx.text.run.Run): Proxy object wrapping element. space (float, optional): Spacing value in Pt. Expand if positive else condense. Defaults to 0.0. ''' p_run._r.get_or_add_rPr().insert(0, parse_xml(r''.format(nsdecls('w'), 20*space))) def set_char_shading(p_run, srgb:int): '''Set character shading color, in case the color is out of highlight color scope. Reference: http://officeopenxml.com/WPtextShading.php Args: p_run (docx.text.run.Run): Proxy object wrapping element. srgb (int): Color value. 
''' # try to set highlight first using python-docx built-in method # Here give 6/16 of the valid highlight colors color_map = { rgb_value((1,0,0)): WD_COLOR_INDEX.RED, rgb_value((0,1,0)): WD_COLOR_INDEX.BRIGHT_GREEN, rgb_value((0,0,1)): WD_COLOR_INDEX.BLUE, rgb_value((1,1,0)): WD_COLOR_INDEX.YELLOW, rgb_value((1,0,1)): WD_COLOR_INDEX.PINK, rgb_value((0,1,1)): WD_COLOR_INDEX.TURQUOISE } if srgb in color_map: p_run.font.highlight_color = color_map[srgb] # set char shading else: c = hex(srgb)[2:].zfill(6) xml = r''.format(nsdecls('w'), c) p_run._r.get_or_add_rPr().insert(0, parse_xml(xml)) def set_char_underline(p_run, srgb:int): '''Set underline and color. Args: p_run (docx.text.run.Run): Proxy object wrapping element. srgb (int): Color value. ''' c = hex(srgb)[2:].zfill(6) xml = r''.format(nsdecls('w'), c) p_run._r.get_or_add_rPr().insert(0, parse_xml(xml)) def add_hyperlink(paragraph, url, text): """Create a hyperlink within a paragraph object. Reference: https://github.com/python-openxml/python-docx/issues/74#issuecomment-215678765 Args: paragraph (Paragraph): ``python-docx`` paragraph adding the hyperlink to. url (str): The required url. text (str): The text displayed for the url. Returns: Run: A Run object containing the hyperlink. 
""" # This gets access to the document.xml.rels file and gets a new relation id value part = paragraph.part r_id = part.relate_to(url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True) # Create the w:hyperlink tag and add needed values hyperlink = OxmlElement('w:hyperlink') hyperlink.set(qn('r:id'), r_id, ) hyperlink.set(qn('w:history'), '1') # Create a w:r element new_run = OxmlElement('w:r') # Create a new w:rPr element rPr = OxmlElement('w:rPr') # Create a w:rStyle element, note this currently does not add the hyperlink style as its not in # the default template, I have left it here in case someone uses one that has the style in it rStyle = OxmlElement('w:rStyle') rStyle.set(qn('w:val'), 'Hyperlink') # Join all the xml elements together add add the required text to the w:r element rPr.append(rStyle) new_run.append(rPr) new_run.text = text hyperlink.append(new_run) # Create a new Run object and add the hyperlink into it r = paragraph.add_run() r._r.append(hyperlink) return r # --------------------------------------------------------- # image properties # --------------------------------------------------------- def add_image(p, image_path_or_stream, width, height): ''' Add image to paragraph. Args: p (Paragraph): ``python-docx`` paragraph instance. image_path_or_stream (str, bytes): Image path or stream. width (float): Image width in Pt. height (float): Image height in Pt. ''' docx_span = p.add_run() try: docx_span.add_picture(image_path_or_stream, width=Pt(width), height=Pt(height)) except UnrecognizedImageError: print('Unrecognized Image.') return # exactly line spacing will destroy image display, so set single line spacing instead p.paragraph_format.line_spacing = 1.00 class _CT_Anchor(BaseOxmlElement): """ ```` element, container for a floating image. 
@classmethod
def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
    """
    Return a new `wp:anchor` element containing the `pic:pic` element
    specified by the argument values.

    Args:
        shape_id: Id assigned to the drawing (``docPr``) of the anchor.
        rId: Relationship id of the image part.
        filename: Image file name stored in the picture element.
        cx, cy: Extent of the picture.
        pos_x, pos_y: Offsets forwarded to the anchor XML template.
    """
    pic_id = 0  # Word doesn't seem to use this, but does not omit it
    pic = CT_Picture.new(pic_id, filename, rId, cx, cy)

    # ``cls.new`` already inserts ``pic`` into the anchor's graphic data,
    # so no second insertion is needed here.
    return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)
def set_cell_border(cell:_Cell, **kwargs):
    '''Apply border attributes to the edges of a table cell.

    Reference:

        * https://stackoverflow.com/questions/33069697/how-to-setup-cell-borders-with-python-docx
        * https://blog.csdn.net/weixin_44312186/article/details/104944110

    Args:
        cell (_Cell): ``python-docx`` Cell instance you want to modify.
        kwargs (dict): One dict of border attributes per edge; recognised edges
            are start, top, end, bottom, insideH and insideV.

    Usage::

        set_cell_border(
            cell,
            top={"sz": 12, "val": "single", "color": "#FF0000", "space": "0"},
            bottom={"sz": 12, "color": "#00FF00", "val": "single"},
            start={"sz": 24, "val": "dashed", "shadow": "true"},
            end={"sz": 12, "val": "dashed"},
        )
    '''
    edges = ('start', 'top', 'end', 'bottom', 'insideH', 'insideV')
    # attributes are written in this fixed order
    attributes = ("sz", "val", "color", "space", "shadow")

    cell_properties = cell._tc.get_or_add_tcPr()

    # reuse the borders container when present, otherwise create it
    borders = cell_properties.first_child_found_in("w:tcBorders")
    if borders is None:
        borders = OxmlElement('w:tcBorders')
        cell_properties.append(borders)

    for edge in edges:
        spec = kwargs.get(edge)
        if not spec:
            continue

        # reuse the edge element when present, otherwise create it
        tag = f'w:{edge}'
        node = borders.find(qn(tag))
        if node is None:
            node = OxmlElement(tag)
            borders.append(node)

        for attribute in attributes:
            if attribute in spec:
                node.set(qn(f'w:{attribute}'), str(spec[attribute]))
class RectType(Enum):
    '''Shape type in context.

    Each member occupies its own bit, so the values are distinct powers of two.
    '''
    HIGHLIGHT = 0b000001
    UNDERLINE = 0b000010
    STRIKE    = 0b000100
    HYPERLINK = 0b001000
    BORDER    = 0b010000
    SHADING   = 0b100000
def is_number(str_number):
    '''Whether can be converted to a float.

    Args:
        str_number: Any object, typically a string.

    Returns:
        bool: True if ``float(str_number)`` succeeds, False otherwise.
    '''
    try:
        float(str_number)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None or a list
        # ValueError: malformed numeric string
        # OverflowError: integer too large to be represented as a float
        return False
    return True
def decode(s:str):
    '''Try to decode a unicode string.

    Every character of ``s`` is treated as one raw byte, and the resulting
    byte sequence is decoded with the first encoding that accepts it.

    Args:
        s (str): String whose characters are raw byte values.

    Returns:
        str: The decoded string. ``s`` is returned unchanged when it contains a
        character above 255, i.e. it cannot be a byte-per-character string.
    '''
    try:
        b = bytes(ord(c) for c in s)
    except ValueError:
        # a code point above 255 cannot be mapped to a single byte
        return s

    for encoding in ('utf-8', 'gbk', 'gb2312', 'iso-8859-1'):
        try:
            return b.decode(encoding)
        except UnicodeDecodeError:
            continue

    # defensive fallback in case no candidate encoding accepted the bytes
    return s
def debug_plot(title:str, show=True):
    '''Plot the returned objects of inner function.

    Args:
        title (str): Page title.
        show (bool, optional): Don't plot if show==False. Default to True.

    Returns:
        A decorator. The decorated function returns exactly what the wrapped
        function returns; plotting is a side effect only.

    .. note::
        Prerequisite of the inner function:
            - the first argument is a :py:class:`~pdf2docx.page.BasePage` instance.
            - the last argument is configuration parameters in ``dict`` type.

    The plot is triggered only when the keyword arguments contain a true
    ``debug`` value and a ``debug_doc``; the document is then saved to
    ``debug_filename``.
    '''
    from functools import wraps

    def wrapper(func):
        @wraps(func)  # keep name and docstring of the decorated function
        def inner(*args, **kwargs):
            # execute function
            objects = func(*args, **kwargs)

            # check if plot page
            debug = kwargs.get('debug', False)
            doc = kwargs.get('debug_doc', None)
            filename = kwargs.get('debug_filename', None)

            if show and objects and debug and doc is not None:
                # the page is needed for plotting only, so look it up lazily
                page = args[0]  # BasePage object
                # create a new page
                debug_page = new_page(doc, page.width, page.height, title)
                # plot objects, e.g. text blocks, shapes, tables...
                objects.plot(debug_page)
                doc.save(filename)

            return objects
        return inner
    return wrapper
Default to None if not encrypted. ''' # fitz object self.filename_pdf = pdf_file self.password = str(password or "") if not pdf_file and not stream: raise ValueError("Either pdf_file or stream must be given.") if stream: self._fitz_doc = fitz.Document(stream=stream) else: self._fitz_doc = fitz.Document(pdf_file) # initialize empty pages container self._pages = Pages() @property def fitz_doc(self): return self._fitz_doc @property def pages(self): return self._pages def close(self): self._fitz_doc.close() @property def default_settings(self): '''Default parsing parameters.''' return { 'debug' : False, # plot layout if True 'ocr' : 0, # ocr status: 0 - no ocr; 1 - to do ocr; 2 - ocr-ed pdf 'ignore_page_error' : True, # not break the conversion process due to failure of a certain page if True 'multi_processing' : False, # convert pages with multi-processing if True 'cpu_count' : 0, # working cpu count when convert pages with multi-processing 'min_section_height' : 20.0, # The minimum height of a valid section. 
'connected_border_tolerance' : 0.5, # two borders are intersected if the gap lower than this value 'max_border_width' : 6.0, # max border width 'min_border_clearance' : 2.0, # the minimum allowable clearance of two borders 'float_image_ignorable_gap' : 5.0, # float image if the intersection exceeds this value 'page_margin_factor_top' : 0.5, # [0,1] reduce top margin by factor 'page_margin_factor_bottom' : 0.5, # [0,1] reduce bottom margin by factor 'shape_min_dimension' : 2.0, # ignore shape if both width and height is lower than this value 'max_line_spacing_ratio' : 1.5, # maximum line spacing ratio: line spacing / line height 'line_overlap_threshold' : 0.9, # [0,1] delete line if the intersection to other lines exceeds this value 'line_break_width_ratio' : 0.5, # break line if the ratio of line width to entire layout bbox is lower than this value 'line_break_free_space_ratio' : 0.1, # break line if the ratio of free space to entire line exceeds this value 'line_separate_threshold' : 5.0, # two separate lines if the x-distance exceeds this value 'new_paragraph_free_space_ratio' : 0.85, # new paragraph if the ratio of free space to line height exceeds this value 'lines_left_aligned_threshold' : 1.0, # left aligned if d_x0 of two lines is lower than this value (Pt) 'lines_right_aligned_threshold' : 1.0, # right aligned if d_x1 of two lines is lower than this value (Pt) 'lines_center_aligned_threshold' : 2.0, # center aligned if delta center of two lines is lower than this value 'clip_image_res_ratio' : 4.0, # resolution ratio (to 72dpi) when clipping page image 'min_svg_gap_dx' : 15.0, # merge adjacent vector graphics if the horizontal gap is less than this value 'min_svg_gap_dy' : 2.0, # merge adjacent vector graphics if the vertical gap is less than this value 'min_svg_w' : 2.0, # ignore vector graphics if the bbox width is less than this value 'min_svg_h' : 2.0, # ignore vector graphics if the bbox height is less than this value 'extract_stream_table' : False, # 
def parse(self, start:int=0, end:int=None, pages:list=None, **kwargs):
    '''Run the three parsing steps in sequence.

    * open PDF file with ``PyMuPDF``
    * analyze whole document, e.g. page section, header/footer and margin
    * parse specified pages, e.g. paragraph, image and table

    Args:
        start (int, optional): First page to process. Defaults to 0, the first page.
        end (int, optional): Last page to process. Defaults to None, the last page.
        pages (list, optional): Range of page indexes to parse. Defaults to None.
        kwargs (dict, optional): Configuration parameters.

    Returns:
        The object returned by the last step, ``parse_pages``.
    '''
    loaded = self.load_pages(start, end, pages)
    analyzed = loaded.parse_document(**kwargs)
    return analyzed.parse_pages(**kwargs)
def parse_pages(self, **kwargs):
    '''Step 3 of converting process: parse pages, e.g. paragraph, image and table.

    Args:
        kwargs (dict, optional): Configuration parameters. ``debug`` and
            ``ignore_page_error`` fall back to their default settings
            (False and True) when omitted.

    Returns:
        Converter: This instance, to allow chained calls.

    Raises:
        ConversionException: If a page fails and either ``debug`` is set or
            ``ignore_page_error`` is disabled.
    '''
    logging.info(self._color_output('[3/4] Parsing pages...'))

    # Read the switches once and tolerate missing keys, so that a direct call
    # without merged settings does not hide a page error behind a KeyError.
    debug = kwargs.get('debug', False)
    ignore_page_error = kwargs.get('ignore_page_error', True)

    pages = [page for page in self._pages if not page.skip_parsing]
    num_pages = len(pages)
    for i, page in enumerate(pages, start=1):
        pid = page.id + 1
        logging.info('(%d/%d) Page %d', i, num_pages, pid)
        try:
            page.parse(**kwargs)
        except Exception as e:
            if debug or not ignore_page_error:
                raise ConversionException(f'Error when parsing page {pid}: {e}') from e
            logging.error('Ignore page %d due to parsing page error: %s', pid, e)

    return self
def store(self):
    '''Store parsed pages in dict format.

    Returns:
        dict: ``filename`` (base name of the pdf, or None when the converter
        was created from an in-memory stream), ``page_cnt`` (count of all
        pages) and ``pages`` (stored data of the finalized pages only).
    '''
    # no file path is available when the pdf was given as a stream
    filename = os.path.basename(self.filename_pdf) if self.filename_pdf else None
    return {
        'filename': filename,
        'page_cnt': len(self._pages), # count of all pages
        'pages'   : [page.store() for page in self._pages if page.finalized], # parsed pages only
    }
def debug_page(self, i:int, docx_filename:str=None, debug_pdf:str=None, layout_file:str=None, **kwargs):
    '''Convert a single page in debug mode and dump its layout.

    Args:
        i (int): Page index to convert.
        docx_filename (str): docx filename to write to.
        debug_pdf (str): New pdf file storing layout information. Default to add prefix ``debug_``.
        layout_file (str): New json file storing parsed layout data. Default to ``layout.json``.
    '''
    # default outputs live next to the source pdf: demo.pdf -> debug_demo.pdf
    folder, pdf_name = os.path.split(self.filename_pdf)
    debug_pdf = debug_pdf or os.path.join(folder, f'debug_{pdf_name}')
    layout_file = layout_file or os.path.join(folder, 'layout.json')

    # switch on debug mode; the layout is plotted into a fresh document
    kwargs['debug'] = True
    kwargs['debug_doc'] = fitz.Document()
    kwargs['debug_filename'] = debug_pdf

    # parse and create docx for the requested page only
    self.convert(docx_filename, pages=[i], **kwargs)

    # layout information for debugging
    self.serialize(layout_file)
def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs):
    '''Parse and create pages based on page indexes with multi-processing.

    Each worker parses one segment of the page range and writes the result to
    a temporary JSON file ``pages-<i>.json``; the files are then loaded into
    this converter, removed, and the docx file is created.

    Reference:

        https://pymupdf.readthedocs.io/en/latest/faq.html#multiprocessing

    Args:
        docx_filename (str): docx file to write.
        start (int): First page to process.
        end (int): Last page to process.
        kwargs (dict): Configuration parameters; ``cpu_count`` limits the
            number of worker processes (0 means all available CPUs).
    '''
    # make vectors of arguments for the processes
    cpu = min(kwargs['cpu_count'], cpu_count()) if kwargs['cpu_count'] else cpu_count()
    prefix = 'pages' # json file writing parsed pages per process
    vectors = [(i, cpu, start, end, self.filename_pdf, self.password,
                kwargs, f'{prefix}-{i}.json') for i in range(cpu)]

    # start parsing processes: one worker per segment, so the ``cpu_count``
    # setting is honoured; the context manager releases the workers once the
    # blocking ``map`` call has returned
    with Pool(cpu) as pool:
        pool.map(self._parse_pages_per_cpu, vectors, 1)

    # restore parsed page data
    for i in range(cpu):
        filename = f'{prefix}-{i}.json'
        if not os.path.exists(filename):
            continue
        try:
            self.deserialize(filename)
        finally:
            # never leave temporary files behind, even if loading fails
            os.remove(filename)

    # create docx file
    self.make_docx(docx_filename, **kwargs)
https://gist.github.com/pklaus/dce37521579513c574d0 ''' name = family = '' FONT_SPECIFIER_NAME_ID = 4 FONT_SPECIFIER_FAMILY_ID = 1 for record in tt_font['name'].names: if b'\x00' in record.string: name_str = record.string.decode('utf-16-be') else: name_str = record.string.decode('latin-1') if record.nameID == FONT_SPECIFIER_NAME_ID and not name: name = name_str elif record.nameID == FONT_SPECIFIER_FAMILY_ID and not family: family = name_str if name and family: break # in case the font name is modified to pattern like BCDGEE+Calibri-Bold return Fonts._normalized_font_name(family) @staticmethod def get_line_height_factor(tt_font:TTFont): '''Calculate line height ratio based on ``hhea`` and ``OS/2`` tables. Fon non-CJK fonts:: f = (hhea.Ascent - hhea.Descent + hhea.LineGap) / units_per_em For non-CJK fonts (Windows):: f = (OS/2.winAscent + OS/2.winDescent + [External Leading]) / units_per_em External Leading = MAX(0, hhea.LineGap - ((OS/2.WinAscent + OS/2.winDescent) - (hhea.Ascent - hhea.Descent))) For CJK fonts:: f = 1.3 * (hhea.Ascent - hhea.Descent) / units_per_em Read more: * https://docs.microsoft.com/en-us/typography/opentype/spec/recom#baseline-to-baseline-distances * https://github.com/source-foundry/font-line#baseline-to-baseline-distance-calculations * https://www.zhihu.com/question/23349103 * https://github.com/source-foundry/font-line/blob/master/lib/fontline/metrics.py ''' units_per_em = tt_font["head"].unitsPerEm # hhea metrics hhea = tt_font["hhea"] hhea_ascent = hhea.ascent hhea_descent = hhea.descent hhea_linegap = hhea.lineGap hhea_total_height = hhea_ascent + abs(hhea_descent) hhea_btb_distance = hhea_total_height + hhea_linegap # depends on System if os.name=='nt': # OS/2 metrics os2 = tt_font["OS/2"] os2_win_ascent = os2.usWinAscent os2_win_descent = os2.usWinDescent os2_win_total_height = os2_win_ascent + os2_win_descent win_external_leading = max(0.0, hhea_linegap-(os2_win_total_height-hhea_total_height)) win_btb_distance = os2_win_total_height 
@staticmethod
def is_cjk_font(tt_font:TTFont):
    '''Test font object to confirm that it meets our definition of a CJK font file.

    The definition is met if any of the following conditions are True:

    1. The font has a CJK code page bit set in the OS/2 table
    2. The font has a CJK Unicode range bit set in the OS/2 table
    3. The font has any CJK Unicode code points defined in the cmap table

    https://github.com/googlefonts/fontbakery/blob/main/Lib/fontbakery/profiles/shared_conditions.py

    Args:
        tt_font (TTFont): Font object providing the ``OS/2`` table and a cmap.

    Returns:
        bool: True if the font is considered a CJK font.
    '''
    os2 = tt_font["OS/2"]

    # OS/2 code page checks; a missing field counts as "no bit set"
    code_page_range = getattr(os2, 'ulCodePageRange1', 0)
    for _, bit in CJK_CODEPAGE_BITS.items():
        if code_page_range & (1 << bit):
            return True

    # OS/2 Unicode range checks: bit n is stored in field ulUnicodeRange<n//32 + 1>
    # at position n % 32. The existence test must address the Unicode range
    # field itself, not the code page field.
    for _, bit in CJK_UNICODE_RANGE_BITS.items():
        index, offset = divmod(bit, 32)
        if index not in (0, 1, 2):
            continue
        unicode_range = getattr(os2, f'ulUnicodeRange{index + 1}', 0)
        if unicode_range & (1 << offset):
            return True

    # defined CJK Unicode code point in cmap table checks
    try:
        cmap = tt_font.getBestCmap()
    except Exception:
        # best effort: a font without a readable cmap is treated as non-CJK
        return False
    if not cmap:
        return False

    for unicode_range in CJK_UNICODE_RANGES:
        for x in range(unicode_range[0], unicode_range[1]+1):
            if x in cmap:
                return True

    # default, return False if the above checks did not identify a CJK font
    return False
class App(Tk):
    '''Top-level window hosting the converter main frame.'''

    def __init__(self, title:str='App', width:int=300, height:int=200):
        '''Create a fixed-size window with given title and dimensions.'''
        super().__init__()

        # window appearance
        self.title(title)
        self.geometry('{}x{}'.format(width, height))
        self.resizable(width=0, height=0) # not allowed to change size

        # layout on the root window
        self.__create_widgets()

        # ask for confirmation when the window is closed
        self.protocol("WM_DELETE_WINDOW", self._on_closing)

    def __create_widgets(self):
        '''Place the main frame in the first grid cell.'''
        frame = MainFrame(self)
        self.widget = frame
        frame.grid(column=0, row=0)

    def _on_closing(self):
        '''Destroy the window and exit once the user confirms.'''
        confirmed = messagebox.askokcancel("Quit", "Do you want to quit?")
        if not confirmed:
            return
        self.destroy()
        sys.exit(0)
This should be attached to a parent ``Tk`` instance.''' super().__init__(parent) self.parent = parent self.config(bg='grey') self.grid(ipadx=100, ipady=100) self.setup_ui() # variables self.pdf_paths = set() # unique pdf files self.docx_folder = None def setup_ui(self): '''Layout of the user interface.''' # PDF to Docx Label self.program_use_label = Label(self, text='PDF to Docx', font='Impact 40', bg='white', fg='#1E90FF') self.program_use_label.grid(row=0, column=0, columnspan=2, padx=120, pady=50) # PDF file entry self.file_path_pdf_entry = Entry(self, border=5) self.file_path_pdf_entry.grid(row=1, column=0, sticky='w', ipady=4, padx=10) # Docx file entry self.file_path_docx_entry = Entry(self, border=5) self.file_path_docx_entry.grid(row=2, column=0, sticky='w', ipady=4, padx=10, pady=75) # Select PDF files Button self.select_pdf_file = Button(self, text='Select PDF files', fg='black', bg='white', border=3, command=self._callback_pdf_file_location) self.select_pdf_file.grid(row=1, column=1, sticky='w') # Select new files folder Button self.select_new_file_folder = Button(self, text='Select new files folder', fg='black', bg='white', border=3, command=self._callback_docx_folder_location) self.select_new_file_folder.grid(row=2, column=1, sticky='w') # Convert Button self.converter_button = Button(self, text='Convert', bg='#1E90FF', fg='white', font='impact 20', border=5, command=self._callback_convert) self.converter_button.grid(row=3, column=0, columnspan=2, ipady=5, ipadx=10) # *This converter can only convert text based pdf cant convert image based pdf Label self.check_label = Label(self, text='*This converter can only convert text based pdf \n cant convert image based pdf', bg='grey') self.check_label.grid(row=4, column=0, columnspan=2, pady=10) # how to check my file Button self.file_check_button = Button(self, text='how to check my file?', bg='grey', fg='black', command=lambda: webbrowser.open_new(r"https://support.policystat.com/hc/en-us/articles" 
r"/207993346-How-can-I-tell-if-my-PDFs-are-text-based-or-not-")) self.file_check_button.grid(row=5, column=0, columnspan=2) def _callback_pdf_file_location(self): '''Opens file explorer and let you select choose the pdf file that you want to convert.''' file_paths = filedialog.askopenfilenames(filetypes=[('PDF file', '*.pdf')]) self.pdf_paths = set() for path in file_paths: self.pdf_paths.add(path) # show just names names = ';'.join([f'"{os.path.basename(path)}"' for path in self.pdf_paths]) self.file_path_pdf_entry.delete(0, 'end') self.file_path_pdf_entry.insert(0, names) def _callback_docx_folder_location(self): '''Opens file explorer and let you choose the folder, that all the converted file or files going to saved.''' self.docx_folder = filedialog.askdirectory() self.file_path_docx_entry.delete(0, 'end') self.file_path_docx_entry.insert(0, self.docx_folder) def _callback_convert(self): '''Starts the convert of the file or files.''' # input check if not self.pdf_paths and not self.docx_folder: messagebox.showwarning( title='Neither files or folder selected', message='Select PDF file or files for convert ' 'and Select a folder for the converted files!') return if not self.pdf_paths: messagebox.showwarning( title='Not files for convert selected', message='Select PDF file or PDF files for convert!') return if not self.docx_folder: messagebox.showwarning( title='Not files folder selected', message='Select a folder for the converted files!') return # collect docx files to convert to docx_paths = [] for pdf_path in self.pdf_paths: base_name = os.path.basename(pdf_path) name, ext = os.path.splitext(base_name) docx_path = os.path.join(self.docx_folder, f'{name}.docx') docx_paths.append(docx_path) if any([os.path.exists(path) for path in docx_paths]) and \ not messagebox.askokcancel(title='Existed target file', message='Docx files with same target name are found under selected folder. 
' 'Do you want to continue and replace them?'): return # now, do the converting work num_succ, num_fail = 0, 0 for pdf_path, docx_path in zip(self.pdf_paths, docx_paths): cv = Converter(pdf_path) try: cv.convert(docx_path) except Exception as e: print(e) num_fail += 1 else: num_succ += 1 finally: cv.close() messagebox.showinfo(title='Convert Done!', message=f'Successful ({num_succ}), Failed ({num_fail}).') pdf2docx-0.5.8/pdf2docx/gui/__init__.py000066400000000000000000000000001455352127700176440ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/image/000077500000000000000000000000001455352127700160435ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/image/Image.py000066400000000000000000000050441455352127700174420ustar00rootroot00000000000000# -*- coding: utf-8 -*- '''Image object. Data structure defined in link https://pymupdf.readthedocs.io/en/latest/textpage.html:: { 'type': 1, 'bbox': (x0,y0,x1,y1), 'width': w, 'height': h, 'image': b'', # --- discard properties --- 'ext': 'png', 'colorspace': n, 'xref': xref, 'yref': yref, 'bpc': bpc } ''' import base64 from io import BytesIO from ..common import docx from ..common.Element import Element class Image(Element): '''Base image object.''' def __init__(self, raw:dict=None): if raw is None: raw = {} self.width = raw.get('width', 0.0) self.height = raw.get('height', 0.0) # source image bytes # - image bytes passed from PyMuPDF -> use it directly # - base64 encoded string restored from json file -> encode to bytes and decode with base64 -> image bytes image = raw.get('image', b'') self.image = image if isinstance(image, bytes) else base64.b64decode(image.encode()) super().__init__(raw) @property def text(self): '''Get an image placeholder ````.''' return '' def from_image(self, image): '''Update with image block/span. Args: image (Image): Target image block/span. 
''' self.width = image.width self.height = image.height self.image = image.image self.update_bbox(image.bbox) return self def store(self): '''Store image with base64 encode. * Encode image bytes with base64 -> base64 bytes * Decode base64 bytes -> str -> so can be serialized in json format ''' res = super().store() res.update({ 'width': self.width, 'height': self.height, 'image': base64.b64encode(self.image).decode() # serialize image with base64 }) return res def plot(self, page, color:tuple): '''Plot image bbox with diagonal lines (for debug purpose). Args: page (fitz.Page): Plotting page. ''' x0, y0, x1, y1 = self.bbox page.draw_line((x0, y0), (x1, y1), color=color, width=0.5) page.draw_line((x0, y1), (x1, y0), color=color, width=0.5) super().plot(page, stroke=color) def make_docx(self, paragraph): '''Add image span to a docx paragraph.''' # add image docx.add_image(paragraph, BytesIO(self.image), self.bbox.x1-self.bbox.x0, self.bbox.y1-self.bbox.y0)pdf2docx-0.5.8/pdf2docx/image/ImageBlock.py000066400000000000000000000037641455352127700204240ustar00rootroot00000000000000# -*- coding: utf-8 -*- '''Definition of Image block objects. **The raw image block will be merged into TextBlock > Line > Span.** ''' from io import BytesIO from ..text.Line import Line from ..text.TextBlock import TextBlock from .Image import Image from .ImageSpan import ImageSpan from ..common.Block import Block from ..common.docx import add_float_image class ImageBlock(Image, Block): '''Image block.''' def __init__(self, raw:dict=None): super().__init__(raw) # inline image type by default self.set_inline_image_block() def to_text_block(self): """Convert image block to a span under text block. Returns: TextBlock: New TextBlock instance containing this image. 
""" # image span span = ImageSpan().from_image(self) # add span to line image_line = Line() image_line.add(span) # insert line to block block = TextBlock() block.add(image_line) # NOTE: it's an image block even though in TextBlock type block.set_inline_image_block() return block def store(self): '''Store ImageBlock instance in raw dict.''' res = Block.store(self) res.update( Image.store(self) ) return res def plot(self, page): '''Plot image bbox with diagonal lines (for debug purpose). Args: page (fitz.Page): pdf page to plot. ''' super().plot(page, color=(1,0,0)) def make_docx(self, p): '''Create floating image behind text. Args: p (Paragraph): ``python-docx`` paragraph instance. .. note:: Inline image is created within TextBlock. ''' if self.is_float_image_block: x0, y0, x1, y1 = self.bbox add_float_image(p, BytesIO(self.image), width=x1-x0, pos_x=x0, pos_y=y0) else: super().make_docx(p) return ppdf2docx-0.5.8/pdf2docx/image/ImageSpan.py000066400000000000000000000013411455352127700202600ustar00rootroot00000000000000# -*- coding: utf-8 -*- '''Image span based on same raw data structure with Image block. ''' from ..common import constants from .Image import Image class ImageSpan(Image): '''Image span.''' def intersects(self, rect): '''Create new ImageSpan object with image contained in given bbox. Args: rect (fitz.Rect): Target bbox. Returns: ImageSpan: A copy of itself if intersects with target; otherwise empty ImageSpan. ''' # add image span if most of of the image is contained in bbox if self.get_main_bbox(rect, constants.FACTOR_MAJOR): return self.copy() # otherwise, ignore image return ImageSpan()pdf2docx-0.5.8/pdf2docx/image/ImagesExtractor.py000066400000000000000000000335261455352127700215270ustar00rootroot00000000000000'''Extract images from PDF. Both raster images and vector graphics are considered: * Normal images like jpeg or png could be extracted with method ``page.get_text('rawdict')`` and ``Page.get_images()``. 
Note the process for png images with alpha channel. * Vector graphics are actually composed of a group of paths, represented by operators like ``re``, ``m``, ``l`` and ``c``. They're detected by finding the contours with ``opencv``. ''' import logging import fitz from ..common.Collection import Collection from ..common.share import BlockType from ..common.algorithm import (recursive_xy_cut, inner_contours, xy_project_profile) class ImagesExtractor: def __init__(self, page:fitz.Page) -> None: '''Extract images from PDF page. Args: page (fitz.Page): pdf page to extract images. ''' self._page = page def clip_page_to_pixmap(self, bbox:fitz.Rect=None, zoom:float=3.0): '''Clip page pixmap (without text) according to ``bbox``. Args: bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page. Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on the final page. zoom (float, optional): Improve resolution by this rate. Defaults to 3.0. Returns: fitz.Pixmap: The extracted pixmap. ''' # hide text self._hide_page_text(self._page) if bbox is None: clip_bbox = self._page.rect # transform to the final bbox when page is rotated elif self._page.rotation: clip_bbox = bbox * self._page.rotation_matrix else: clip_bbox = bbox clip_bbox = clip_bbox & self._page.rect # improve resolution # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution # - https://github.com/pymupdf/PyMuPDF/issues/181 matrix = fitz.Matrix(zoom, zoom) return self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap def clip_page_to_dict(self, bbox:fitz.Rect=None, clip_image_res_ratio:float=3.0): '''Clip page pixmap (without text) according to ``bbox`` and convert to source image. Args: bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page. clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0. Returns: list: A list of image raw dict. 
''' pix = self.clip_page_to_pixmap(bbox=bbox, zoom=clip_image_res_ratio) return self._to_raw_dict(pix, bbox) def extract_images(self, clip_image_res_ratio:float=3.0): '''Extract normal images with ``Page.get_images()``. Args: clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap. Defaults to 3.0. Returns: list: A list of extracted and recovered image raw dict. .. note:: ``Page.get_images()`` contains each image only once, which may less than the real count of images in a page. ''' # pdf document doc = self._page.parent rotation = self._page.rotation # The final view might be formed by several images with alpha channel only, as shown in issue-123. # It's still inconvenient to extract the original alpha/mask image, as a compromise, extract the # equivalent image by clipping the union page region for now. # https://github.com/dothinking/pdf2docx/issues/123 # step 1: collect images: [(bbox, item), ..., ] ic = Collection() for item in self._page.get_images(full=True): item = list(item) item[-1] = 0 # find all occurrences referenced to this image rects = self._page.get_image_rects(item) unrotated_page_bbox = self._page.cropbox # note the difference to page.rect for bbox in rects: # ignore small images if bbox.get_area()<=4: continue # ignore images outside page if not unrotated_page_bbox.intersects(bbox): continue # collect images ic.append((bbox, item)) # step 2: group by intersection fun = lambda a, b: a[0].intersects(b[0]) groups = ic.group(fun) # step 3: check each group images = [] for group in groups: # clip page with the union bbox of all intersected images if len(group) > 1: clip_bbox = fitz.Rect() for (bbox, item) in group: clip_bbox |= bbox raw_dict = self.clip_page_to_dict(clip_bbox, clip_image_res_ratio) else: bbox, item = group[0] # Regarding images consist of alpha values only, the turquoise color shown in # the PDF is not part of the image, but part of PDF background. 
# So, just to clip page pixmap according to the right bbox # https://github.com/pymupdf/PyMuPDF/issues/677 # It's not safe to identify images with alpha values only, # - colorspace is None, for pymupdf <= 1.23.8 # - colorspace is always Colorspace(CS_RGB), for pymupdf==1.23.9-15 -> issue # - colorspace is Colorspace(CS_), for pymupdf >= 1.23.16 # So, use extracted image info directly. # image item: (xref, smask, width, height, bpc, colorspace, ...), e.g., # (19, 0, 331, 369, 1, '', '', 'Im1', 'FlateDecode', 0) # (20, 24, 1265, 1303, 8, 'DeviceRGB', '', 'Im2', 'FlateDecode', 0) # (21, 0, 331, 369, 1, '', '', 'Im3', 'CCITTFaxDecode', 0) # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0) # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0) if item[5]=='': raw_dict = self.clip_page_to_dict(bbox, clip_image_res_ratio) # normal images else: # recover image, e.g., handle image with mask, or CMYK color space pix = self._recover_pixmap(doc, item) # rotate image with opencv if page is rotated raw_dict = self._to_raw_dict(pix, bbox) if rotation: raw_dict['image'] = self._rotate_image(pix, -rotation) images.append(raw_dict) return images def detect_svg_contours(self, min_svg_gap_dx:float, min_svg_gap_dy:float, min_w:float, min_h:float): '''Find contour of potential vector graphics. Args: min_svg_gap_dx (float): Merge svg if the horizontal gap is less than this value. min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value. min_w (float): Ignore contours if the bbox width is less than this value. min_h (float): Ignore contours if the bbox height is less than this value. Returns: list: A list of potential svg region: (external_bbox, inner_bboxes:list). 
''' import cv2 as cv # clip page and convert to opencv image pixmap = self.clip_page_to_pixmap(zoom=1.0) src = self._pixmap_to_cv_image(pixmap) # gray and binary gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY) _, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV) # external bbox: split images with recursive xy cut external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy) # inner contours grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes] # combined external and inner contours groups = list(zip(external_bboxes, grouped_inner_bboxes)) # plot detected images for debug debug = False if debug: # plot projection profile for each sub-image for i, (x0, y0, x1, y1) in enumerate(external_bboxes): arr = xy_project_profile(src[y0:y1, x0:x1, :], binary[y0:y1, x0:x1]) cv.imshow(f'sub-image-{i}', arr) for bbox, inner_bboxes in groups: # plot external bbox x0, y0, x1, y1 = bbox cv.rectangle(src, (x0, y0), (x1, y1), (255,0,0), 1) # plot inner bbox for u0, v0, u1, v1 in inner_bboxes: cv.rectangle(src, (u0, v0), (u1, v1), (0,0,255), 1) cv.imshow("img", src) cv.waitKey(0) return groups @staticmethod def _to_raw_dict(image:fitz.Pixmap, bbox:fitz.Rect): '''Store Pixmap ``image`` to raw dict. Args: image (fitz.Pixmap): Pixmap to store. bbox (fitz.Rect): Boundary box the pixmap. Returns: dict: Raw dict of the pixmap. ''' return { 'type': BlockType.IMAGE.value, 'bbox': tuple(bbox), 'width': image.width, 'height': image.height, 'image': image.tobytes() } @staticmethod def _rotate_image(pixmap:fitz.Pixmap, rotation:int): '''Rotate image represented by image bytes. Args: pixmap (fitz.Pixmap): Image to rotate. rotation (int): Rotation angle. Return: image bytes. 
''' import cv2 as cv import numpy as np # convert to opencv image img = ImagesExtractor._pixmap_to_cv_image(pixmap) h, w = img.shape[:2] # get image height, width # calculate the center of the image x0, y0 = w//2, h//2 # default scale value for now -> might be extracted from PDF page property scale = 1.0 # rotation matrix matrix = cv.getRotationMatrix2D((x0, y0), rotation, scale) # calculate the final dimension cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) # compute the new bounding dimensions of the image W = int((h * sin) + (w * cos)) H = int((h * cos) + (w * sin)) # adjust the rotation matrix to take into account translation matrix[0, 2] += (W / 2) - x0 matrix[1, 2] += (H / 2) - y0 # perform the rotation holding at the center rotated_img = cv.warpAffine(img, matrix, (W, H)) # convert back to bytes _, im_png = cv.imencode('.png', rotated_img) return im_png.tobytes() @staticmethod def _hide_page_text(page:fitz.Page): '''Hide page text before clipping page.''' # NOTE: text might exist in both content stream and form object stream # - content stream, i.e. direct page content # - form object, i.e. contents referenced by this page xref_list = [xref for (xref, name, invoker, bbox) in page.get_xobjects()] xref_list.extend(page.get_contents()) # render Tr: set the text rendering mode # - 3: neither fill nor stroke the text -> invisible # read more: # - https://github.com/pymupdf/PyMuPDF/issues/257 # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf doc = page.parent # type: fitz.Document for xref in xref_list: stream = doc.xref_stream(xref).replace(b'BT', b'BT 3 Tr') \ .replace(b'Tm', b'Tm 3 Tr') \ .replace(b'Td', b'Td 3 Tr') doc.update_stream(xref, stream) @staticmethod def _recover_pixmap(doc:fitz.Document, item:list): """Restore pixmap with soft mask considered. 
References: * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList * https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-handle-stencil-masks * https://github.com/pymupdf/PyMuPDF/issues/670 Args: doc (fitz.Document): pdf document. item (list): image instance of ``page.get_images()``. Returns: fitz.Pixmap: Recovered pixmap with soft mask considered. """ # data structure of `item`: # (xref, smask, width, height, bpc, colorspace, ...) x = item[0] # xref of PDF image s = item[1] # xref of its /SMask # base image pix = fitz.Pixmap(doc, x) # reconstruct the alpha channel with the smask if exists if s > 0: mask = fitz.Pixmap(doc, s) if pix.alpha: temp = fitz.Pixmap(pix, 0) # make temp pixmap w/o the alpha pix = None # release storage pix = temp # check dimension if pix.width==mask.width and pix.height==mask.height: pix = fitz.Pixmap(pix, mask) # now compose final pixmap else: logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item) # we may need to adjust something for CMYK pixmaps here -> # recreate pixmap in RGB color space if necessary # NOTE: pix.colorspace may be None for images with alpha channel values only if 'CMYK' in item[5].upper(): pix = fitz.Pixmap(fitz.csRGB, pix) return pix @staticmethod def _pixmap_to_cv_image(pixmap:fitz.Pixmap): '''Convert fitz Pixmap to opencv image. Args: pixmap (fitz.Pixmap): PyMuPDF Pixmap. ''' import cv2 as cv import numpy as np img_byte = pixmap.tobytes() return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)pdf2docx-0.5.8/pdf2docx/image/__init__.py000066400000000000000000000000001455352127700201420ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/layout/000077500000000000000000000000001455352127700162765ustar00rootroot00000000000000pdf2docx-0.5.8/pdf2docx/layout/Blocks.py000066400000000000000000000626721455352127700201020ustar00rootroot00000000000000''' A group of text elements, distinguished to ``Shape`` elements. 
For instance, ``TextBlock``, ``ImageBlock`` or ``TableBlock`` after parsing, while ``Line`` instances at the beginning, and a combination of ``Line`` and ``TableBlock`` during parsing process. ''' import logging from docx.shared import Pt from ..common import constants from ..common.Collection import ElementCollection from ..common.share import (BlockType, lower_round, rgb_value) from ..common.Block import Block from ..common.docx import (reset_paragraph_format, delete_paragraph) from ..text.TextBlock import TextBlock from ..text.TextSpan import TextSpan from ..text.Line import Line from ..text.Lines import Lines from ..table.Cell import Cell from ..image.ImageBlock import ImageBlock from ..table.TableBlock import TableBlock class Blocks(ElementCollection): '''Block collections.''' def __init__(self, instances:list=None, parent=None): ''' A collection of text based elements, e.g. lines, images or blocks.''' super().__init__(instances, parent) self._floating_image_blocks = [] def _update_bbox(self, block:Block): '''Override. The parent is ``Layout``, which is not necessary to update its bbox. So, do nothing but required here. 
''' pass @property def floating_image_blocks(self): return self._floating_image_blocks @property def lattice_table_blocks(self): '''Get lattice table blocks contained in this Collection.''' return list(filter( lambda block: isinstance(block, Block) and block.is_lattice_table_block, self._instances)) @property def stream_table_blocks(self): '''Get stream table blocks contained in this Collection.''' return list(filter( lambda block: isinstance(block, Block) and block.is_stream_table_block, self._instances)) @property def table_blocks(self): '''Get table blocks contained in this Collection.''' return list(filter( lambda block: isinstance(block, Block) and block.is_table_block, self._instances)) @property def inline_image_blocks(self): '''Get inline image blocks contained in this Collection.''' return list(filter( lambda block: isinstance(block, Block) and block.is_inline_image_block, self._instances)) @property def text_blocks(self): '''Get text/image blocks contained in this Collection.''' return list(filter( lambda block: isinstance(block, Block) and block.is_text_image_block, self._instances)) def restore(self, raws:list): '''Clean current instances and restore them from source dict. ImageBlock is converted to ImageSpan contained in TextBlock. Args: raws (list): A list of raw dicts representing text/image/table blocks. Returns: Blocks: self ''' self.reset() # clean current instances for raw_block in raws: block_type = raw_block.get('type', -1) # type: int # inline image block -> text block if block_type == BlockType.IMAGE.value: block = ImageBlock(raw_block).to_text_block() # text block elif block_type == BlockType.TEXT.value: block = TextBlock(raw_block) # table block elif block_type in (BlockType.LATTICE_TABLE.value, BlockType.STREAM_TABLE.value): block = TableBlock(raw_block) else: block = None # add to list self.append(block) return self def clean_up(self, float_image_ignorable_gap:float, line_overlap_threshold:float): '''Clean up blocks in page level. 
* convert to lines * remove lines out of page * remove transformed text: text direction is not (1, 0) or (0, -1) * remove empty lines Args: float_image_ignorable_gap (float): Regarded as float image if the intersection exceeds this value. line_overlap_threshold (float): remove line if the intersection exceeds this value. .. note:: The block structure extracted from ``PyMuPDF`` might be unreasonable, e.g. * one real paragraph is split into multiple blocks; or * one block consists of multiple real paragraphs ''' if not self._instances: return # convert to lines instances = [] for block in self._instances: if not isinstance(block, (ImageBlock, TextBlock)): continue instances.extend(block.lines) # delete invalid lines page_bbox = self.parent.working_bbox f = lambda line: line.bbox.intersects(page_bbox) and \ not line.white_space_only and ( line.is_horizontal_text or line.is_vertical_text) instances = list(filter(f, instances)) # delete redundant blanks for line in instances: line.strip() # detect floating images self.reset(instances) \ ._identify_floating_images(float_image_ignorable_gap) \ ._remove_overlapped_lines(line_overlap_threshold) def assign_to_tables(self, tables:list): '''Add blocks (line or sub-table) to associated cells of given tables. Args: tables (list): A list of TableBlock instances. 
''' if not tables: return # assign blocks to table region blocks_in_tables = [[] for _ in tables] # type: list[list[Line|TableBlock]] blocks = [] # type: list[Line|TableBlock] for block in self._instances: self._assign_block_to_tables(block, tables, blocks_in_tables, blocks) # assign blocks to associated cells for table, blocks_in_table in zip(tables, blocks_in_tables): # no contents for this table if not blocks_in_table: continue table.assign_blocks(blocks_in_table) # sort in natural reading order and update layout blocks blocks.extend(tables) self.reset(blocks) def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float): '''Collect elements in Line level (line or table bbox), which may contained in a stream table region. Table may exist on the following conditions: * blocks in a row don't follow flow layout; or * block is contained in potential shading Args: potential_shadings (list): a group of shapes representing potential cell shading line_separate_threshold (float): two separate lines if the x-distance exceeds this value Returns: list: A list of Lines. Each group of Lines represents a potential table. .. note:: ``PyMuPDF`` may group multi-lines in a row as a text block while each line belongs to different cell. So, it's required to deep into line level. ''' # group lines by row: any intersection would be counted as same group # NOTE: consider real text direction when all lines have same orientation, i.e. either horizontal or # vertical; otherwise, consider the default direction, i.e. horizontal. 
rows = self.group_by_rows(text_direction=not self.is_mix_text) # get sub-lines from block: line or table block def sub_line(block): return block if isinstance(block, Line) else Line().update_bbox(block.outer_bbox) # exclude potential shading in white bg-color shadings_exclude_white = list(filter( lambda shape: not shape.is_determined and shape.color!=rgb_value([1,1,1]), potential_shadings)) def contained_in_shadings(block): for shading in shadings_exclude_white: if shading.contains(block, threshold=constants.FACTOR_MOST): return True return False # store text lines in a potential table res = [] # type: list[Lines] table_lines = [] # type: list[Line] def close_table(): if not table_lines: return res.append(Lines(table_lines)) table_lines.clear() # check row by row ref_pos = rows[0].bbox.y1 cell_layout = isinstance(self.parent, Cell) for row in rows: bbox = row.bbox # flow layout or not? if not row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout): table_lines.extend([sub_line(block) for block in row]) else: close_table() # contained in shading or not? for block in row: if contained_in_shadings(block): table_lines.append(sub_line(block)) # close table if significant vertical distance if bbox.y0-ref_pos>=50: close_table() # update reference pos ref_pos = bbox.y1 # don't forget last table close_table() return res def parse_block(self, max_line_spacing_ratio:float, line_break_free_space_ratio:float, new_paragraph_free_space_ratio:float): '''Group lines into text block.''' # sort in normal reading order self.sort_in_reading_order_plus() # join lines with similar properties, e.g. 
spacing, together into text block blocks = self._join_lines_vertically(max_line_spacing_ratio) # split text block by checking text blocks = self._split_text_block_vertically(blocks, line_break_free_space_ratio, new_paragraph_free_space_ratio) self.reset(blocks) def parse_text_format(self, rects, delete_end_line_hyphen:bool): '''Parse text format with style represented by stroke/fill shapes. Args: rects (Shapes): Potential styles applied on blocks. delete_end_line_hyphen (bool): delete hyphen at the end of a line if True. ''' # parse text block style one by one for block in filter(lambda e: e.is_text_block, self._instances): block.parse_text_format(rects) # adjust word at the end of each line block.lines.adjust_last_word(delete_end_line_hyphen) def parse_spacing(self, *args): '''Calculate external and internal space for text blocks: - vertical distance between blocks, i.e. paragraph before/after spacing - horizontal distance to left/right border, i.e. paragraph left/right indent - vertical distance between lines, i.e. paragraph line spacing ''' if not self._instances: return self._parse_block_horizontal_spacing(*args) self._parse_block_vertical_spacing() self._parse_line_spacing() def make_docx(self, doc): '''Create page based on parsed block structure. Args: doc (Document, _Cell): The container to make docx content. 
''' def make_table(table_block, pre_table): # create dummy paragraph if table before space is set # - a minimum line height of paragraph is 0.7pt, so ignore before space if less than this value # - but tow adjacent tables will be combined automatically, so adding a minimum dummy paragraph is required if table_block.before_space>=constants.MIN_LINE_SPACING or pre_table: h = lower_round(table_block.before_space, 1) # round(x,1), but to lower bound p = doc.add_paragraph() reset_paragraph_format(p, line_spacing=Pt(h)) # new table table = doc.add_table(rows=table_block.num_rows, cols=table_block.num_cols) table.autofit = False table.allow_autofit = False table_block.make_docx(table) pre_table = False cell_layout = isinstance(self.parent, Cell) for block in self._instances: # make paragraphs if block.is_text_image_block: # new paragraph p = doc.add_paragraph() block.make_docx(p) pre_table = False # mark block type # make table elif block.is_table_block: make_table(block, pre_table) pre_table = True # mark block type # NOTE: within a cell, there is always an empty paragraph after table, # so, delete it right here. # https://github.com/dothinking/pdf2docx/issues/76 if cell_layout: delete_paragraph(doc.paragraphs[-1]) # NOTE: If a table is at the end of a page, a new paragraph will be automatically # added by the rending engine, e.g. MS Word, which resulting in an unexpected # page break. The solution is to never put a table at the end of a page, so add # an empty paragraph and reset its format, particularly line spacing, when a table # is created. 
for block in self._instances[::-1]: # ignore float image block if block.is_float_image_block: continue # nothing to do if not end with table block if not block.is_table_block: break # otherwise, add a small paragraph p = doc.add_paragraph() reset_paragraph_format(p, Pt(constants.MIN_LINE_SPACING)) # a small line height def plot(self, page): '''Plot blocks in PDF page for debug purpose.''' for block in self._instances: block.plot(page) # ---------------------------------------------------------------------------------- # internal methods # ---------------------------------------------------------------------------------- def _identify_floating_images(self, float_image_ignorable_gap:float): '''Identify floating image lines and convert to ImageBlock.''' # group lines by connectivity groups = self.group_by_connectivity(dx=-float_image_ignorable_gap, dy=-float_image_ignorable_gap) # identify floating images for group in filter(lambda group: len(group)>1, groups): for line in filter(lambda line: line.image_spans, group): float_image = ImageBlock().from_image(line.spans[0]) float_image.set_float_image_block() self._floating_image_blocks.append(float_image) # remove the original image line from flow layout by setting empty bbox line.update_bbox((0,0,0,0)) return self def _remove_overlapped_lines(self, line_overlap_threshold:float): '''Delete overlapped lines. NOTE: Don't run this method until floating images are excluded. 
def _join_lines_vertically(self, max_line_spacing_ratio:float):
    '''Create text blocks by merging lines that follow each other in vertical direction.

    At this moment, each instance is either a ``Line`` or a ``TableBlock``. A table block
    closes the text block being collected and is passed through unchanged.

    A line is appended to the current text block when it is the first line, when it is in
    the same row as the previous line, or when its distance to the previous line does not
    exceed the most frequent spacing (plus a tolerance of 1.0) while that spacing is no
    more than ``max_line_spacing_ratio`` times the previous line height. Lines with image
    spans, or lines following one, always start a new text block.

    Args:
        max_line_spacing_ratio (float): Upper limit of the ratio between the reference
            spacing and the height of the previous line.

    Returns:
        list: ``TextBlock`` / ``TableBlock`` instances in source order; an empty list when
        there is nothing to process.
    '''
    from collections import Counter

    # nothing to merge; also avoids indexing an empty collection below
    if not self._instances: return []

    idx0, idx1 = (1, 3) if self.is_horizontal_text else (0, 2)

    def get_v_bdy(block):
        '''Coordinates of block top and bottom boundaries.'''
        bbox = block.bbox
        return bbox[idx0], bbox[idx1]

    def vertical_distance(block1, block2):
        '''Gap from the bottom boundary of ``block1`` to the top boundary of ``block2``.'''
        _, bottom = get_v_bdy(block1)
        top, _ = get_v_bdy(block2)
        return round(top-bottom, 2)

    def line_height(line:Line):
        '''The height of line span with most characters.'''
        span = max(line.spans, key=lambda s: len(s.text))
        h = span.bbox.height if line.is_horizontal_text else span.bbox.width
        return round(h, 2)

    def common_vertical_spacing():
        '''Vertical distance with most frequency: a reference of line spacing.'''
        boundaries = [get_v_bdy(block) for block in self._instances]
        distances = [round(cur[0]-pre[1], 2) for pre, cur in zip(boundaries, boundaries[1:])]
        if not distances: return 0.0

        # count once instead of calling list.count for every element; on ties the value
        # seen first wins, since dict keys keep insertion order and max returns the first maximum
        counts = Counter(distances)
        return max(counts, key=counts.get)

    # create text block based on lines
    blocks = [] # type: list[TextBlock]
    lines = []  # type: list[Line]
    def close_text_block():
        '''Wrap the collected lines into a text block and start collecting again.'''
        if not lines: return
        block = TextBlock()
        block.add(lines)
        blocks.append(block)
        lines.clear()

    # check line by line
    ref_dis = common_vertical_spacing()
    for block in self._instances:

        # if current is a table block:
        # - finish previous text block; and
        # - add this table block directly
        if isinstance(block, TableBlock):
            close_text_block()
            blocks.append(block)
            continue

        # check two adjacent text lines
        ref_line = lines[-1] if lines else None

        # first line or in same row with previous line: no need to create new text block
        if not ref_line or ref_line.in_same_row(block):
            start_new_block = False

        # image line: create new text block
        elif block.image_spans or ref_line.image_spans:
            start_new_block = True

        # not larger than common line spacing: no need to create new text block
        elif vertical_distance(ref_line, block)<=ref_dis+1.0 and \
                ref_dis<=max_line_spacing_ratio*line_height(ref_line):
            start_new_block = False

        else:
            start_new_block = True

        if start_new_block: close_text_block()
        lines.append(block)

    # don't forget last group
    close_text_block()

    return blocks
blocks in table cell bbox = self.parent.working_bbox # check text direction for vertical space calculation: # - normal reading direction (from left to right) -> the reference boundary is top border, i.e. bbox[1]. # - vertical text direction, e.g. from bottom to top -> left border bbox[0] is the reference idx = 1 if self.is_horizontal_text else 0 ref_block = self._instances[0] ref_pos = bbox[idx] for block in self._instances: # NOTE: the table bbox is counted on center-line of outer borders, so a half of top border # size should be excluded from the calculated vertical spacing if block.is_table_block: dw = block[0][0].border_width[0] / 2.0 # use top border of the first cell else: dw = 0.0 start_pos = block.bbox[idx] - dw para_space = start_pos-ref_pos # modify vertical space in case the block is out of bottom boundary dy = max(block.bbox[idx+2]-bbox[idx+2], 0.0) para_space -= dy para_space = max(para_space, 0.0) # ignore negative value # ref to current (paragraph): set before-space for paragraph if block.is_text_block: # spacing before this paragraph block.before_space = para_space # if ref to current (image): set before-space for paragraph elif block.is_inline_image_block: block.before_space = para_space # ref (paragraph/image) to current: set after-space for ref paragraph elif ref_block.is_text_block or ref_block.is_inline_image_block: ref_block.after_space = para_space # situation with very low probability, e.g. 
def _parse_line_spacing(self):
    '''Calculate internal vertical space for text blocks, i.e. paragraph line spacing in docx.

    Non-text blocks are skipped. A text block with at least one ``TextSpan`` lacking a
    valid line height gets ``line_space_type = 0`` and is parsed with exact line spacing;
    any other text block is parsed with relative line spacing.

    .. note::
        Run parsing block vertical spacing in advance.
    '''
    def lacks_line_height(text_block):
        '''True if a line of the block contains a text span without valid line height.'''
        for row in text_block.lines:
            flags = [not piece.is_valid_line_height
                        for piece in row.spans if isinstance(piece, TextSpan)]
            if True in flags:
                return True
        return False

    for item in self._instances:
        if item.is_text_block:
            if lacks_line_height(item):
                item.line_space_type = 0
                item.parse_exact_line_spacing()
            else:
                item.parse_relative_line_spacing()
class Column(Layout):
    '''Column of Section: a layout whose working area is its own bbox.'''

    @property
    def working_bbox(self):
        '''Working bbox of a column, i.e. the column bbox itself.'''
        return self.bbox


    def add_elements(self, elements:Collection):
        '''Distribute candidate elements to this column.

        ``Line`` instances are passed to ``assign_blocks`` and ``Shape`` instances to
        ``assign_shapes``; elements of any other type are ignored.

        Args:
            elements (Collection): Candidate lines and shapes.
        '''
        def of_type(kind):
            return list(filter(lambda item: isinstance(item, kind), elements))

        lines = of_type(Line)
        shapes = of_type(Shape)
        self.assign_blocks(lines)
        self.assign_shapes(shapes)


    def make_docx(self, doc):
        '''Create Section Column in docx by delegating to the contained blocks.

        Args:
            doc (Document): ``python-docx`` document object
        '''
        self.blocks.make_docx(doc)
def restore(self, data:dict):
    '''Restore Layout from parsed results.

    Args:
        data (dict): Stored layout. Key ``bbox`` defaults to an all-zero bbox, keys
            ``blocks`` and ``shapes`` default to an empty list.

    Returns:
        The layout itself.
    '''
    bbox = data.get('bbox', (0, 0, 0, 0))
    self.update_bbox(bbox)

    # blocks first, then shapes
    for key in ('blocks', 'shapes'):
        getattr(self, key).restore(data.get(key, []))

    return self
def parse(self, **settings):
    '''Parse layout: tables, then paragraphs, then the layouts nested in table blocks.

    Nothing is done when the layout has no blocks.

    Args:
        settings (dict): Layout parsing parameters, forwarded unchanged to every step.
    '''
    if self.blocks:
        # table structure must be known before lines are grouped to paragraphs
        self._parse_table(**settings)
        self._parse_paragraph(**settings)

        # sub-layout, i.e. cell layouts under table block
        for candidate in self.blocks:
            if candidate.is_table_block:
                candidate.parse(**settings)
class Section(BaseCollection):
    '''A collection of ``Column`` instances sharing one section of a page.'''

    def __init__(self, space:int=0, columns:list=None, parent=None):
        """Initialize Section instance.

        Args:
            space (int, optional): Space between adjacent columns. Defaults to 0.
            columns (list, optional): A list of Column instances. Defaults to None.
            parent (Sections, optional): Parent element. Defaults to None.
        """
        self.space = space
        self.before_space = 0.0
        super().__init__(columns, parent)


    @property
    def num_cols(self):
        '''Count of columns in this section.'''
        return len(self)


    def store(self):
        '''Store parsed section layout in dict format.'''
        return {
            'bbox'         : tuple(self.bbox),
            'num_cols'     : self.num_cols,
            'space'        : self.space,
            'before_space' : self.before_space,
            'columns'      : super().store()
        }


    def restore(self, raw:dict):
        '''Restore section from source dict.

        Args:
            raw (dict): Stored section with optional keys ``space``, ``before_space``
                and ``columns``.

        Returns:
            The section itself.
        '''
        # bbox is maintained automatically based on columns
        self.space = raw.get('space', 0)                # space between adjacent columns
        self.before_space = raw.get('before_space', 0)  # vertical space before this section

        # get each column
        for raw_col in raw.get('columns', []):
            self.append(Column().restore(raw_col))

        return self


    def parse(self, **settings):
        '''Parse section layout, i.e. every column with the same settings.'''
        for column in self:
            column.parse(**settings)
        return self


    def make_docx(self, doc):
        '''Create section in docx.

        Args:
            doc (Document): ``python-docx`` document object
        '''
        # set section column
        section = doc.sections[-1]
        width_list = [c.bbox[2]-c.bbox[0] for c in self]
        set_columns(section, width_list, self.space)

        # create each column
        for index, column in enumerate(self):
            # column break to start new column: decided by position, so the result does
            # not depend on how columns compare with each other
            if index > 0:
                doc.add_section(WD_SECTION.NEW_COLUMN)

            # make doc
            column.make_docx(doc)
class Sections(BaseCollection):
    '''Collection of ``Section`` instances of a page.'''

    def restore(self, raws:list):
        """Rebuild the collection from stored section dicts, dropping current content."""
        self.reset()
        for stored in raws:
            self.append(Section().restore(stored))
        return self


    def parse(self, **settings):
        '''Parse every contained section with the given settings.'''
        for item in self:
            item.parse(**settings)
        return self


    def make_docx(self, doc):
        '''Create sections in docx.

        Args:
            doc (Document): ``python-docx`` document object
        '''
        if not self:
            return

        # paragraph count before this page is created: index of the first new paragraph
        first_index = len(doc.paragraphs)

        # ---------------------------------------------------
        # first section
        # ---------------------------------------------------
        head = self[0]

        # vertical position: a dummy paragraph is needed only for a noticeable before space
        if head.before_space > constants.MINOR_DIST:
            spacer = doc.add_paragraph()
            height = min(head.before_space, 11)
            spacer_format = reset_paragraph_format(spacer, line_spacing=Pt(height))
            spacer_format.space_after = Pt(head.before_space-height)

        if head.num_cols==2:
            doc.add_section(WD_SECTION.CONTINUOUS)
        head.make_docx(doc)

        # ---------------------------------------------------
        # more sections
        # ---------------------------------------------------
        for follower in self[1:]:
            # new section symbol
            doc.add_section(WD_SECTION.CONTINUOUS)

            # The after space of the last paragraph defines the vertical position of
            # this section.
            # NOTE: the after space doesn't work if the last paragraph is image only
            # (without any text). In this case, set it on the section break instead.
            paragraphs = doc.paragraphs
            anchor = paragraphs[-2] # -1 is the section break
            if not anchor.text.strip() and 'graphicData' in anchor._p.xml:
                anchor = paragraphs[-1]
            anchor.paragraph_format.space_after = Pt(follower.before_space)

            # section content
            follower.make_docx(doc)

        # ---------------------------------------------------
        # floating images
        # ---------------------------------------------------
        # lazy: all float images go to the first paragraph of current page
        for picture in self.parent.float_images:
            picture.make_docx(doc.paragraphs[first_index])


    def plot(self, page):
        '''Plot all section blocks for debug purpose.'''
        for item in self:
            for col in item:
                col.plot(page, stroke=(1,1,0), width=1.5) # column bbox
                col.blocks.plot(page) # blocks
@staticmethod
def debug(pdf_file:str,
            password:str=None,
            page:int=0,
            docx_file:str=None,
            debug_pdf:str=None,
            layout_file:str='layout.json',
            **kwargs):
    '''Convert one PDF page and plot layout information for debugging.

    Exceptions raised during conversion are not caught, but the converter is closed
    in any case.

    Args:
        pdf_file (str) : PDF filename to read from.
        password (str): Password for encrypted pdf. Default to None if not encrypted.
        page (int, optional): Page index to convert.
        docx_file (str, optional): docx filename to write to.
        debug_pdf (str, optional): Filename for new pdf storing layout information.
            Defaults to same name with pdf file.
        layout_file (str, optional): Filename for new json file storing parsed layout data.
            Defaults to ``layout.json``.
        kwargs (dict) : Configuration parameters. With ``zero_based_index=False`` the page
            index is taken as one-based.
    '''
    # index starts from zero or one
    if not kwargs.get('zero_based_index', True):
        page = max(page-1, 0)

    # explode exception directly if debug mode, but never leave the converter open
    cv = Converter(pdf_file, password)
    try:
        cv.debug_page(page, docx_file, debug_pdf, layout_file, **kwargs)
    finally:
        cv.close()
class BasePage:
    '''Page with basic geometry: width, height and margin.'''

    def __init__(self, width:float=0.0, height:float=0.0, margin:tuple=None):
        '''Initialize page layout.

        Args:
            width (float, optional): Page width. Defaults to 0.0.
            height (float, optional): Page height. Defaults to 0.0.
            margin (tuple, optional): Page margin as (left, right, top, bottom).
                An empty or missing value is replaced with four zeros.
        '''
        self.width, self.height = width, height
        self.margin = margin if margin else (0,) * 4


    @property
    def bbox(self):
        '''Full page area, starting at the origin.'''
        return (0.0, 0.0, self.width, self.height)


    @property
    def working_bbox(self):
        '''bbox with margin considered.'''
        left, right, top, bottom = self.margin
        near_x, near_y, far_x, far_y = self.bbox
        return (near_x+left, near_y+top, far_x-right, far_y-bottom)
class Page(BasePage):
    '''Object representing the whole page, e.g. margins, sections.'''

    def __init__(self, id:int=-1,
                        skip_parsing:bool=True,
                        width:float=0.0,
                        height:float=0.0,
                        header:str=None,
                        footer:str=None,
                        margin:tuple=None,
                        sections:Sections=None,
                        float_images:BaseCollection=None):
        '''Initialize page layout.

        Args:
            id (int, optional): Page index. Defaults to -1.
            skip_parsing (bool, optional): Don't parse page if True. Defaults to True.
            width (float, optional): Page width. Defaults to 0.0.
            height (float, optional): Page height. Defaults to 0.0.
            header (str, optional): Page header. Defaults to None.
            footer (str, optional): Page footer. Defaults to None.
            margin (tuple, optional): Page margin. Defaults to None.
            sections (Sections, optional): Page contents. Defaults to None.
            float_images (BaseCollection, optional): Float images in this page. Defaults to None.
        '''
        # page index and parsing flag
        self.id = id
        self.skip_parsing = skip_parsing

        # page size and margin
        super().__init__(width=width, height=height, margin=margin)

        # flow structure:
        # Section -> Column -> Blocks -> TextBlock/TableBlock
        # TableBlock -> Row -> Cell -> Blocks
        self.sections = sections if sections else Sections(parent=self)

        # page header, footer
        self.header = header if header else ''
        self.footer = footer if footer else ''

        # floating images are separate node under page
        self.float_images = float_images if float_images else BaseCollection()

        self._finalized = False


    @property
    def finalized(self):
        '''Whether the page layout has been parsed or restored.'''
        return self._finalized


    def store(self):
        '''Store parsed layout in dict format.'''
        stored = {}
        stored['id'] = self.id
        stored['width'] = self.width
        stored['height'] = self.height
        stored['margin'] = self.margin
        stored['sections'] = self.sections.store()
        stored['header'] = self.header
        stored['footer'] = self.footer
        stored['floats'] = self.float_images.store()
        return stored


    def restore(self, data:dict):
        '''Restore Layout from parsed results.

        Args:
            data (dict): Stored page; a missing key falls back to the same default
                as a freshly created page.

        Returns:
            The page itself.
        '''
        # page id, size and margin
        self.id = data.get('id', -1)
        self.width = data.get('width', 0.0)
        self.height = data.get('height', 0.0)
        self.margin = data.get('margin', (0,) * 4)

        # parsed layout
        self.sections.restore(data.get('sections', []))
        self.header = data.get('header', '')
        self.footer = data.get('footer', '')

        # float images
        self._restore_float_images(data.get('floats', []))

        # Suppose layout is finalized when restored; otherwise, set False explicitly
        # out of this method.
        self._finalized = True

        return self


    @debug_plot('Final Layout')
    def parse(self, **settings):
        '''Parse page layout and mark the page as finalized.'''
        self.sections.parse(**settings)
        self._finalized = True
        return self.sections # for debug plot


    def extract_tables(self, **settings):
        '''Extract content from tables (top layout only).

        .. note::
            Before running this method, the page layout must be either parsed from source
            page or restored from parsed data.

        Args:
            settings (dict): ``extract_stream_table`` selects all table blocks when true,
                lattice table blocks only otherwise.

        Returns:
            list: The ``text`` of every selected table block.
        '''
        # table blocks
        found = []
        for section in self.sections:
            for column in section:
                source = column.blocks
                if settings['extract_stream_table']:
                    found.extend(source.table_blocks)
                else:
                    found.extend(source.lattice_table_blocks)

        # table content
        return [table_block.text for table_block in found]


    def make_docx(self, doc):
        '''Set page size, margin, and create page.

        .. note::
            Before running this method, the page layout must be either parsed from source
            page or restored from parsed data.

        Args:
            doc (Document): ``python-docx`` document object
        '''
        # new page, unless the document is still empty: a default section is there
        # when opening docx
        if not doc.paragraphs:
            target = doc.sections[0]
        else:
            target = doc.add_section(WD_SECTION.NEW_PAGE)

        # page size
        target.page_width = Pt(self.width)
        target.page_height = Pt(self.height)

        # page margin
        m_left, m_right, m_top, m_bottom = self.margin
        target.left_margin = Pt(m_left)
        target.right_margin = Pt(m_right)
        target.top_margin = Pt(m_top)
        target.bottom_margin = Pt(m_bottom)

        # create flow layout: sections
        self.sections.make_docx(doc)


    def _restore_float_images(self, raws:list):
        '''Restore float images, replacing the ones currently stored.'''
        self.float_images.reset()
        for stored in raws:
            picture = ImageBlock(stored)
            picture.set_float_image_block()
            self.float_images.append(picture)
settings (dict): Parsing parameters. ''' # --------------------------------------------- # 0. extract fonts properties, especially line height ratio # --------------------------------------------- fonts = Fonts.extract(fitz_doc) # --------------------------------------------- # 1. extract and then clean up raw page # --------------------------------------------- pages, raw_pages = [], [] words_found = False for page in self: if page.skip_parsing: continue # init and extract data from PDF raw_page = RawPageFactory.create(page_engine=fitz_doc[page.id], backend='PyMuPDF') raw_page.restore(**settings) # check if any words are extracted since scanned pdf may be directed if not words_found and raw_page.raw_text.strip(): words_found = True # process blocks and shapes based on bbox raw_page.clean_up(**settings) # process font properties raw_page.process_font(fonts) # after this step, we can get some basic properties # NOTE: floating images are detected when cleaning up blocks, so collect them here page.width = raw_page.width page.height = raw_page.height page.float_images.reset().extend(raw_page.blocks.floating_image_blocks) raw_pages.append(raw_page) pages.append(page) # show message if no words found if not words_found: logging.warning('Words count: 0. It might be a scanned pdf, which is not supported yet.') # --------------------------------------------- # 2. parse structure in document/pages level # --------------------------------------------- # NOTE: blocks structure might be changed in this step, e.g. promote page header/footer, # so blocks structure based process, e.g. calculating margin, parse section should be # run after this step. header, footer = Pages._parse_document(raw_pages) # --------------------------------------------- # 3. parse structure in page level, e.g. 
page margin, section # --------------------------------------------- # parse sections for page, raw_page in zip(pages, raw_pages): # page margin margin = raw_page.calculate_margin(**settings) raw_page.margin = page.margin = margin # page section sections = raw_page.parse_section(**settings) page.sections.extend(sections) @staticmethod def _parse_document(raw_pages:list): '''Parse structure in document/pages level, e.g. header, footer''' # TODO return '', ''pdf2docx-0.5.8/pdf2docx/page/RawPage.py000066400000000000000000000233341455352127700176020ustar00rootroot00000000000000'''A wrapper of pdf page engine (e.g. PyMuPDF, pdfminer) to do the following work: * extract source contents * clean up blocks/shapes, e.g. elements out of page * calculate page margin * parse page structure roughly, i.e. section and column ''' from abc import (ABC, abstractmethod) from .BasePage import BasePage from ..layout.Section import Section from ..layout.Column import Column from ..shape.Shape import Hyperlink from ..shape.Shapes import Shapes from ..layout.Blocks import Blocks from ..font.Fonts import Fonts from ..text.TextSpan import TextSpan from ..common.share import debug_plot from ..common import constants from ..common.Collection import Collection class RawPage(BasePage, ABC): '''A wrapper of page engine.''' def __init__(self, page_engine=None): ''' Initialize page layout. Args: page_engine (Object): Source pdf page. ''' BasePage.__init__(self) self.page_engine = page_engine self.blocks = Blocks(parent=self) self.shapes = Shapes(parent=self) @abstractmethod def extract_raw_dict(self, **settings): '''Extract source data with page engine. Return a dict with the following structure: ``` { "width" : w, "height": h, "blocks": [{...}, {...}, ...], "shapes" : [{...}, {...}, ...] } ``` ''' @property def text(self): '''All extracted text in this page, with images considered as ````. 
Should be run after ``restore()`` data.''' return '\n'.join([block.text for block in self.blocks]) @property def raw_text(self): '''Extracted raw text in current page. Should be run after ``restore()`` data.''' return '\n'.join([block.raw_text for block in self.blocks]) @debug_plot('Source Text Blocks') def restore(self, **settings): '''Initialize layout extracted with ``PyMuPDF``.''' raw_dict = self.extract_raw_dict(**settings) self.blocks.restore(raw_dict.get('blocks', [])) self.shapes.restore(raw_dict.get('shapes', [])) return self.blocks @debug_plot('Cleaned Shapes') def clean_up(self, **settings): '''Clean up raw blocks and shapes, e.g. * remove negative or duplicated instances, * detect semantic type of shapes ''' # clean up blocks first self.blocks.clean_up( settings['float_image_ignorable_gap'], settings['line_overlap_threshold']) # clean up shapes self.shapes.clean_up( settings['max_border_width'], settings['shape_min_dimension']) return self.shapes def process_font(self, fonts:Fonts): '''Update font properties, e.g. font name, font line height ratio, of ``TextSpan``. Args: fonts (Fonts): Fonts parsed by ``fonttools``. ''' # get all text span spans = [] for line in self.blocks: spans.extend([span for span in line.spans if isinstance(span, TextSpan)]) # check and update font name, line height for span in spans: font = fonts.get(span.font) if not font: continue # update font properties with font parsed by fonttools span.font = font.name if font.line_height: span.line_height = font.line_height * span.size def calculate_margin(self, **settings): """Calculate page margin. .. note:: Ensure this method is run right after cleaning up the layout, so the page margin is calculated based on valid layout, and stay constant. """ # Exclude hyperlink from shapes because hyperlink might exist out of page unreasonably, # while it should always within page since attached to text. 
shapes = Shapes([shape for shape in self.shapes if not isinstance(shape, Hyperlink)]) # return default margin if no blocks exist if not self.blocks and not shapes: return (constants.ITP, ) * 4 x0, y0, x1, y1 = self.bbox u0, v0, u1, v1 = self.blocks.bbox | shapes.bbox # margin left = max(u0-x0, 0.0) right = max(x1-u1-constants.MINOR_DIST, 0.0) top = max(v0-y0, 0.0) bottom = max(y1-v1, 0.0) # reduce calculated top/bottom margin to leave some free space top *= settings['page_margin_factor_top'] bottom *= settings['page_margin_factor_bottom'] # use normal margin if calculated margin is large enough return ( min(constants.ITP, round(left, 1)), min(constants.ITP, round(right, 1)), min(constants.ITP, round(top, 1)), min(constants.ITP, round(bottom, 1))) def parse_section(self, **settings): '''Detect and create page sections. .. note:: - Only two-columns Sections are considered for now. - Page margin must be parsed before this step. ''' # bbox X0, Y0, X1, _ = self.working_bbox # collect all blocks (line level) and shapes elements = Collection() elements.extend(self.blocks) elements.extend(self.shapes) if not elements: return # to create section with collected lines lines = Collection() sections = [] def close_section(num_col, elements, y_ref): # append to last section if both single column if sections and sections[-1].num_cols==num_col==1: column = sections[-1][0] # type: Column column.union_bbox(elements) column.add_elements(elements) # otherwise, create new section else: section = self._create_section(num_col, elements, (X0, X1), y_ref) if section: sections.append(section) # check section row by row pre_num_col = 1 y_ref = Y0 # to calculate v-distance between sections for row in elements.group_by_rows(): # check column col by col cols = row.group_by_columns() current_num_col = len(cols) # column check: # consider 2-cols only if current_num_col>2: current_num_col = 1 # the width of two columns shouldn't have significant difference elif current_num_col==2: u0, v0, u1, v1 = 
cols[0].bbox m0, n0, m1, n1 = cols[1].bbox x0 = (u1+m0)/2.0 c1, c2 = x0-X0, X1-x0 # column width w1, w2 = u1-u0, m1-m0 # line width f = 2.0 if not 1/f<=c1/c2<=f or w1/c1<0.33 or w2/c2<0.33: current_num_col = 1 # process exceptions if pre_num_col==2 and current_num_col==1: # though current row has one single column, it might have another virtual # and empty column. If so, it should be counted as 2-cols cols = lines.group_by_columns() pos = cols[0].bbox[2] if row.bbox[2]<=pos or row.bbox[0]>pos: current_num_col = 2 # pre_num_col!=current_num_col => to close section with collected lines, # before that, further check the height of collected lines else: x0, y0, x1, y1 = lines.bbox if y1-y0