pax_global_header 0000666 0000000 0000000 00000000064 15077504734 0014526 g ustar 00root root 0000000 0000000 52 comment=7058a9e577b3529f1a494b7166a8019ef4c556f4
ebooklib-0.20/ 0000775 0000000 0000000 00000000000 15077504734 0013235 5 ustar 00root root 0000000 0000000 ebooklib-0.20/.coveragerc 0000664 0000000 0000000 00000000745 15077504734 0015364 0 ustar 00root root 0000000 0000000 [run]
# Specify directories/files to include for coverage measurement
source = ebooklib/
# Specify directories/files to exclude from coverage
omit = samples/*, tests/*
[report]
# Fail if total coverage is below this percentage
fail_under = 65
# Exclude lines matching these regular expressions
exclude_lines =
pragma: no cover
if TYPE_CHECKING:
raise NotImplementedError
if __name__ == .__main__.:
[html]
# Directory for the HTML report
directory = htmlcov_reports
ebooklib-0.20/.github/ 0000775 0000000 0000000 00000000000 15077504734 0014575 5 ustar 00root root 0000000 0000000 ebooklib-0.20/.github/workflows/ 0000775 0000000 0000000 00000000000 15077504734 0016632 5 ustar 00root root 0000000 0000000 ebooklib-0.20/.github/workflows/ci-tests.yml 0000664 0000000 0000000 00000002337 15077504734 0021115 0 ustar 00root root 0000000 0000000 name: Test on Pull Request
on:
pull_request:
jobs:
verify:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: Run ruff format check
uses: astral-sh/ruff-action@v3
with:
args: 'format --check --diff'
- name: Run ruff lint check
uses: astral-sh/ruff-action@v3
with:
args: 'check --show-fixes'
test:
needs: [verify]
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: Make test script executable
run: chmod +x ./test.sh
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Test with python ${{ matrix.python-version }}
run: |
python -m pip install --upgrade pip
pip install -e ".[test]"
pip install pygments
./test.sh
- name: Upload coverage reports
uses: actions/upload-artifact@v4
with:
name: coverage-report-python-${{ matrix.python-version }}
path: test_reports/coverage/
ebooklib-0.20/.gitignore 0000664 0000000 0000000 00000002472 15077504734 0015232 0 ustar 00root root 0000000 0000000 #ignore these in git listings
*~
.emacs*
.DS_Store
.swp
*.swp
\#*#
.#*
\#*
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
### JetBrains template
.idea/
.vscode
ebooklib-0.20/.readthedocs.yaml 0000664 0000000 0000000 00000001142 15077504734 0016462 0 ustar 00root root 0000000 0000000 # Read the Docs configuration file for Sphinx projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/conf.py
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
# python:
# install:
# - requirements: docs/requirements.txt
ebooklib-0.20/AUTHORS.txt 0000664 0000000 0000000 00000003355 15077504734 0015131 0 ustar 00root root 0000000 0000000 Listed in chronological order:
Aleksandar Erkalovic
Borko Jandras
the-happy-hippo
Daniel James
Mikhail Gusarov
Raymond Yee
Chris Grice
Fernando Wime
Andy Roberts
Tom McLean
Travis L
Helmy Giacoman
clach04
Edward Betts
Kennyl
Oleg Pshenichniy
clach04
Olivier Le Thanh Duong
Michael Storm
Sergey Markelov
Laurent Laporte
Deborah Kaplan
Willem van der Walt
Peter <82152909+repeterat@users.noreply.github.com>
u8slvn
tugot17
张悦
Morphus
Kian-Meng, Ang
Jared Wong
Michael Timblin
Hsiu-Ming Chang
Bono Lv
Tom Ritchford
Juan Pablo Delgado
Tom Kuson
Samuel Clay
plu5
Youssif Shaaban Alsager
Soroush Javadi
Milad
Tyler Whipple
João Seckler
Yuan-Yi Chang
Nicolas Ivan Milicevic
林檎
ebooklib-0.20/CHANGES.txt 0000664 0000000 0000000 00000021407 15077504734 0015052 0 ustar 00root root 0000000 0000000 EbookLib Changelog
==================
All notable changes to this project will be documented in this file.
Version 0.20 (2025-10-26)
-------------------------
Added
~~~~~
* Add Read the docs configuration file (#333)
* Add pytest support and GitHub workflow (#349)
* Add extension method to EpubHtml object (#298)
Changed
~~~~~~~
* Reformat all code with ruff, fix docs
* Update the list of software using ebooklib
* Increase the priority of inserting into
* Return bytes in EpubHtml's content getter methods (#319)
Fixed
~~~~~
* Fix the version in Sphinx documentation (#335)
* Bug fix: Prevent None join error when HREF is not found on a link_node (#328)
* Fix item.get_type() does not recognize webm (#312, #338)
* Add more informative text for future Contributors (#333)
Version 0.19 (2025-05-02)
-------------------------
Added
~~~~~
* Add option to set toc page direction (#234)
* Add compresslevel parameter for zip compression (#287)
Changed
~~~~~~~
* Update README.md with refreshing the project issue
* Use posix module instead of os.path when managing EPUB references (#305)
* Change deprecation check for ignore_ncx
* Using lxml text_content() on all nodes for better ToC parsing
Fixed
~~~~~
* Throwing errors while writing EPUB is optional now and behavior will be changed in the future (#316)
* Fix lxml root warning (#305)
* Toning down metadata failure to an info log
* Check truth value of ignore_ncx deprecation, not just truthiness (#283)
* Add direction to the epub Nav page only if it was set
Version 0.18 (2022-11-30)
-------------------------
Added
~~~~~
* Add an example for embedding images into paragraph
* Add default option ignore_ncx to EpubReader and document it (#183)
* Add option to set custom title in toc page (#243)
* Add `scripted` to EpubHtml properties if a script is added (#176)
* Read un-zipped epub directories (#183)
* Add direction for HTML tags (#174)
Changed
~~~~~~~
* Update AUTHORS.txt file
* Improved header in README.md
* Updated README.md; Added python linting, updated file links
* Use custom XMLParser for security
* Use double quotes for XML declaration in xml container
Fixed
~~~~~
* Define format for the long description
* Fix dublin core typo in tutorial documentation (#256)
* Fix EpubImage lacks parameters (#231)
* Fix the issue where the toc could not be loaded from an epub file
* Fixed #218 - Use correct variable for static item ids
* Ignore NCX if Navigation Document exists
* Try hard to parse through broken XML, improve the compatibility
Version 0.17.1 (2019-01-03)
---------------------------
Added
~~~~~
* Initial SMIL support (#08_SMIL sample)
* Adding support for the EPUB3 pagebreak/page-list semantics
* Page lists are now attached to each EpubHtml item as well as to the book
Fixed
~~~~~
* Increase version number to 0.17.1
* Update README.md
* Update AUTHORS.txt file
* Fixed #168 - Improve Sphinx documentation a bit
* Fixed #166 - Readfile method fails on MS Windows
* Fixed #165 - Sample file 05_plugins_create files on Python 3+
* Fixed #164 - Parsing files fails because of page-list
* Fixed #163 - epub:type is not properly extracted
Version 0.17 (2018-07-04)
-------------------------
Added
~~~~~
* Add WAI-ARIA role to the nav element for accessibility
* Add ability to include playOrder into navPoint (#146)
* Add get_type method to EpubCover class (#147)
Changed
~~~~~~~
* Improve playOrder logic to handle subchapters correctly (#148)
* Improved `.gitignore` (#144)
* Make `samples/04_markdown_parse` Python 2+3 compatible (#143)
* Dynamically change the long-description in project configuration
* Change double quotes into single quotes
Fixed
~~~~~
* Fixed #161 - Provide default value for metadata
* Fixed #159 - Add doc-toc role programmatically for EPUB accessibility
* Fixed #137 - Create a PyPi compatible README.md file
* Fixed #139 - Correct MANIFEST.in file and rename CHANGES.TXT to CHANGES.txt
* Fixed #132 - Read the README.md file using utf-8 encoding
* Fixed #125 - Refactor writing of OPF file
* Fixed #123 - Remove unused test files
* Fixed #121 - Fix relative path resolving
* Don't cut off first character of body in EpubHtml.get_body_content()
* Fix for pop-up footnotes in EPUB 3
Version 0.16 (2017-04-03)
-------------------------
Added
~~~~~
* Add ability to set custom prefixes and namespaces to content.opf file
* Add bindings support (#70)
Changed
~~~~~~~
* Remove Booktype version number
* Reconstruct options for legacy tidy
* Control attribute order of meta tag
* Update Readme with new project that uses ebooklib
Fixed
~~~~~
* Fixed #119 - Plugin samples file uses obsolete API
* Fixed #112 - Get body content calls get content for no reason
* Fixed #111 - Initialize mimetype library only when it is being used
* Fixed #110 - Add .ttf extension to the list of font files
* Fixed #109 - spine: text orientation attribute
* Fixed #101 - Update list of authors
* Fixed #100 - Fix docstrings
* Fixed #99 - Change README.md to have ASCII encoding
* Fixed #98 - Underline tag gets deleted
* Fixed #97 - Sections are not pointing to any specific content
* Fixed #92 - set.cover not working in Python 3.4
* Fixed #85 - Epub2 guide types do not map completely to epub3 landmark types
* Fixed #84 - Set unique metadata function
* Fixed #83 - In the sample file read cover image in binary mode
* Fixed #82 - Ignore vim swapfiles
* Fixed #81 - Mimetypes library isn't aware of xhtml
* Fixed #72 - example for reading epub didn't work, wrong module level for ITEM_IMAGE
* Fixed #71 - Handling sections with content
* Fixed #69 - Correct rendition URL in OPF file
* Fixed #64 - embed
_lnk.text = ""
else:
_lnk = etree.SubElement(_head, "link", lnk)
# this should not be like this
# head = html_root.find('head')
# if head is not None:
# for i in head.getchildren():
# if i.tag == 'title' and self.title != '':
# continue
# _head.append(i)
# create and populate body
_body = etree.SubElement(tree_root, "body")
if self.direction:
_body.set("dir", self.direction)
tree_root.set("dir", self.direction)
body = html_tree.find("body")
if body is not None:
for i in body.getchildren():
_body.append(i)
tree_str = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)
return tree_str
def __str__(self):
    """
    Return a short, human-readable description of this item.

    The format string used to be empty, so ``str(item)`` was always ``""``
    and the ``id`` / ``file_name`` arguments were silently discarded. The
    class name is taken from the instance because the enclosing class header
    is not part of this view.
    """
    return "<{cls}:{id}:{file_name}>".format(cls=type(self).__name__, id=self.id, file_name=self.file_name)  # noqa: UP032
class EpubCoverHtml(EpubHtml):
    """
    Represents Cover page in the EPUB file.

    The page is rendered from the book's "cover" template and is flagged as
    non-linear (``is_linear = False``).
    """

    def __init__(self, uid="cover", file_name="cover.xhtml", image_name="", title="Cover"):
        """
        :Args:
          - uid: Identifier of the cover page item (default "cover")
          - file_name: File name of the cover page (default "cover.xhtml")
          - image_name: Value written to the ``src`` attribute of the cover image
          - title: Page title, also used as the image ``alt`` text
        """
        super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title)  # noqa: UP008

        self.image_name = image_name
        self.is_linear = False

    def is_chapter(self):
        """
        Returns if this document is chapter or not.

        :Returns:
          Returns bool value. Always False for the cover page.
        """

        return False

    def get_content(self):
        """
        Returns content for cover page as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).

        Note that this overwrites ``self.content`` with the book's "cover"
        template on every call.

        :Returns:
          Returns content of this document.
        """

        self.content = self.book.get_template("cover")

        tree = parse_string(super(EpubCoverHtml, self).get_content())  # noqa: UP008
        tree_root = tree.getroot()

        images = tree_root.xpath("//xhtml:img", namespaces={"xhtml": NAMESPACES["XHTML"]})

        # The first <img> of the rendered template is the cover image; a
        # template without any <img> element raises IndexError here.
        images[0].set("src", self.image_name)
        images[0].set("alt", self.title)

        tree_str = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

        return tree_str

    def __str__(self):
        # The format string used to be empty, which made str() return "".
        return "<EpubCoverHtml:{id}:{file_name}>".format(id=self.id, file_name=self.file_name)  # noqa: UP032
class EpubNav(EpubHtml):
    """
    Represents Navigation Document in the EPUB file.
    """

    def __init__(self, uid="nav", file_name="nav.xhtml", media_type="application/xhtml+xml", title="", direction=None):
        """
        :Args:
          - uid: Identifier of the navigation item (default "nav")
          - file_name: File name of the navigation document (default "nav.xhtml")
          - media_type: Media type of the document
          - title: Title of the navigation page
          - direction: Optional text direction, passed through to the base class
        """
        super(EpubNav, self).__init__(  # noqa: UP008
            uid=uid, file_name=file_name, media_type=media_type, title=title, direction=direction
        )

    def is_chapter(self):
        """
        Returns if this document is chapter or not.

        :Returns:
          Returns bool value. Always False for the navigation document.
        """

        return False

    def __str__(self):
        # The format string used to be empty, which made str() return "".
        return "<EpubNav:{id}:{file_name}>".format(id=self.id, file_name=self.file_name)  # noqa: UP032
class EpubImage(EpubItem):
    """
    Represents Image in the EPUB file.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded unchanged to the base item.
        super(EpubImage, self).__init__(*args, **kwargs)  # noqa: UP008

    def get_type(self):
        """
        :Returns:
          Always ``ebooklib.ITEM_IMAGE``.
        """
        return ebooklib.ITEM_IMAGE

    def __str__(self):
        # The format string used to be empty, which made str() return "".
        return "<EpubImage:{id}:{file_name}>".format(id=self.id, file_name=self.file_name)  # noqa: UP032
class EpubSMIL(EpubItem):
    """
    Represents a SMIL item in the EPUB file.

    The media type is fixed to "application/smil+xml".
    """

    def __init__(self, uid=None, file_name="", content=None):
        super(EpubSMIL, self).__init__(uid=uid, file_name=file_name, media_type="application/smil+xml", content=content)  # noqa: UP008

    def get_type(self):
        """
        :Returns:
          Always ``ebooklib.ITEM_SMIL``.
        """
        return ebooklib.ITEM_SMIL

    def __str__(self):
        # The format string used to be empty, which made str() return "".
        return "<EpubSMIL:{id}:{file_name}>".format(id=self.id, file_name=self.file_name)  # noqa: UP032
# EpubBook
class EpubBook(object):  # noqa: UP004
    """
    In-memory model of a book: metadata, items, spine, guide, page list,
    table of contents, bindings and the templates used to render pages.
    """

    def __init__(self):
        self.EPUB_VERSION = None

        self.reset()

        # we should have options here

    def reset(self):
        """Initialises all needed variables to default values"""

        self.metadata = {}
        self.items = []
        self.spine = []
        self.guide = []
        self.pages = []
        self.toc = []
        self.bindings = []

        self.IDENTIFIER_ID = "id"
        self.FOLDER_NAME = "EPUB"

        # counters used by add_item() to generate ids for items without one
        self._id_html = 0
        self._id_image = 0
        self._id_static = 0

        self.title = ""
        self.language = "en"
        self.direction = None

        self.templates = {"ncx": NCX_XML, "nav": NAV_XML, "chapter": CHAPTER_XML, "cover": COVER_XML}

        self.add_metadata(
            "OPF",
            "generator",
            "",
            {"name": "generator", "content": "Ebook-lib {}".format(".".join([str(s) for s in VERSION]))},
        )

        # default to using a randomly-unique identifier if one is not specified manually
        self.set_identifier(str(uuid.uuid4()))

        # custom prefixes and namespaces to be set to the content.opf doc
        self.prefixes = []
        self.namespaces = {}

    def set_identifier(self, uid):
        """
        Sets unique id for this epub

        :Args:
          - uid: Value of unique identifier for this book
        """

        self.uid = uid

        self.set_unique_metadata("DC", "identifier", self.uid, {"id": self.IDENTIFIER_ID})

    def set_title(self, title):
        """
        Set title. You can set multiple titles.

        :Args:
          - title: Title value
        """

        self.title = title

        self.add_metadata("DC", "title", self.title)

    def set_language(self, lang):
        """
        Set language for this epub. You can set multiple languages. Specific items in the book can have
        different language settings.

        :Args:
          - lang: Language code
        """

        self.language = lang

        self.add_metadata("DC", "language", lang)

    def set_direction(self, direction):
        """
        Set the direction for this epub.

        :Args:
          - direction: Options are "ltr", "rtl" and "default"
        """

        self.direction = direction

    def set_cover(self, file_name, content, create_page=True):
        """
        Set cover and create cover document if needed.

        :Args:
          - file_name: file name of the cover page
          - content: Content for the cover image
          - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
        """

        # as it is now, it can only be called once
        c0 = EpubCover(file_name=file_name)
        c0.content = content
        self.add_item(c0)

        if create_page:
            c1 = EpubCoverHtml(image_name=file_name)
            self.add_item(c1)

        self.add_metadata(None, "meta", "", OrderedDict([("name", "cover"), ("content", "cover-img")]))

    def add_author(self, author, file_as=None, role=None, uid="creator"):
        """
        Add author for this document

        :Args:
          - author: Author name, stored as DC "creator"
          - file_as: Optional "file-as" value refining the creator entry
          - role: Optional "role" value refining the creator entry
          - uid: Id of the creator element, referenced by the refinements
        """

        self.add_metadata("DC", "creator", author, {"id": uid})

        if file_as:
            self.add_metadata(
                None, "meta", file_as, {"refines": "#" + uid, "property": "file-as", "scheme": "marc:relators"}
            )
        if role:
            self.add_metadata(None, "meta", role, {"refines": "#" + uid, "property": "role", "scheme": "marc:relators"})

    def add_metadata(self, namespace, name, value, others=None):
        """
        Add metadata

        :Args:
          - namespace: Key of NAMESPACES (e.g. "DC") or a namespace value used as-is
          - name: Metadata name
          - value: Metadata value
          - others: Optional mapping of extra attributes stored with the value
        """

        if namespace in NAMESPACES:
            namespace = NAMESPACES[namespace]

        if namespace not in self.metadata:
            self.metadata[namespace] = {}

        if name not in self.metadata[namespace]:
            self.metadata[namespace][name] = []

        self.metadata[namespace][name].append((value, others))

    def get_metadata(self, namespace, name):
        """
        Retrieve metadata

        :Args:
          - namespace: Key of NAMESPACES (e.g. "DC") or a namespace value used as-is
          - name: Metadata name

        :Returns:
          List of (value, others) tuples. Empty list if nothing is stored for
          this namespace or name.
        """

        if namespace in NAMESPACES:
            namespace = NAMESPACES[namespace]

        # A namespace that has no metadata yet used to raise KeyError even
        # though a missing name already returned []; treat both the same way.
        return self.metadata.get(namespace, {}).get(name, [])

    def set_unique_metadata(self, namespace, name, value, others=None):
        """
        Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata.
        """

        if namespace in NAMESPACES:
            namespace = NAMESPACES[namespace]

        if namespace in self.metadata and name in self.metadata[namespace]:
            self.metadata[namespace][name] = [(value, others)]
        else:
            self.add_metadata(namespace, name, value, others)

    def add_item(self, item):
        """
        Add additional item to the book. If not defined, media type and chapter id will be defined
        for the item.

        :Args:
          - item: Item instance

        :Returns:
          The item that was added.
        """

        if item.media_type == "":
            # NOTE(review): if guess_type() follows mimetypes.guess_type() and
            # returns (type, encoding), the second value is an encoding and not
            # a media type - confirm against the helper before relying on it.
            (has_guessed, media_type) = guess_type(item.get_name().lower())

            if has_guessed:
                if media_type is not None:
                    item.media_type = media_type
                else:
                    item.media_type = has_guessed
            else:
                item.media_type = "application/octet-stream"

        if not item.get_id():
            # make chapter_, image_ and static_ configurable
            if isinstance(item, EpubHtml):
                item.id = "chapter_{id_html}".format(id_html=self._id_html)  # noqa: UP032
                self._id_html += 1
                # If there's a page list, append it to the book's page list
                self.pages += item.pages
            elif isinstance(item, EpubImage):
                item.id = "image_{id_image}".format(id_image=self._id_image)  # noqa: UP032
                self._id_image += 1
            else:
                item.id = "static_{id_static}".format(id_static=self._id_static)  # noqa: UP032
                self._id_static += 1

        item.book = self
        self.items.append(item)

        return item

    def get_item_with_id(self, uid):
        """
        Returns item for defined UID.

        >>> book.get_item_with_id('image_001')

        :Args:
          - uid: UID for the item

        :Returns:
          Returns item object. Returns None if nothing was found.
        """
        for item in self.get_items():
            if item.id == uid:
                return item

        return None

    def get_item_with_href(self, href):
        """
        Returns item for defined HREF.

        >>> book.get_item_with_href('EPUB/document.xhtml')

        :Args:
          - href: HREF for the item we are searching for

        :Returns:
          Returns item object. Returns None if nothing was found.
        """
        for item in self.get_items():
            if item.get_name() == href:
                return item

        return None

    def get_items(self):
        """
        Returns all items attached to this book.

        :Returns:
          Returns a generator over all items.
        """
        return (item for item in self.items)

    def get_items_of_type(self, item_type):
        """
        Returns all items of specified type.

        >>> book.get_items_of_type(epub.ITEM_IMAGE)

        :Args:
          - item_type: Type for items we are searching for

        :Returns:
          Returns a generator over the found items.
        """
        return (item for item in self.items if item.get_type() == item_type)

    def get_items_of_media_type(self, media_type):
        """
        Returns all items of specified media type.

        :Args:
          - media_type: Media type for items we are searching for

        :Returns:
          Returns a generator over the found items.
        """
        return (item for item in self.items if item.media_type == media_type)

    def set_template(self, name, value):
        """
        Defines templates which are used to generate certain types of pages. When defining new value for the template
        we have to use content of type 'str' (Python 2) or 'bytes' (Python 3).

        At the moment we use these templates:
          - ncx
          - nav
          - chapter
          - cover

        :Args:
          - name: Name for the template
          - value: Content for the template
        """

        self.templates[name] = value

    def get_template(self, name):
        """
        Returns value for the template.

        :Args:
          - name: template name

        :Returns:
          Value of the template. Returns None if the template is not defined.
        """
        return self.templates.get(name)

    def add_prefix(self, name, uri):
        """
        Appends custom prefix to be added to the content.opf document

        >>> epub_book.add_prefix('bkterms', 'http://booktype.org/')

        :Args:
          - name: namespace name
          - uri: URI for the namespace
        """

        self.prefixes.append("{name}: {uri}".format(name=name, uri=uri))  # noqa: UP032
class EpubWriter(object): # noqa: UP004
DEFAULT_OPTIONS = {
"epub2_guide": True,
"epub3_landmark": True,
"epub3_pages": True,
"landmark_title": "Guide",
"pages_title": "Pages",
"spine_direction": True,
"package_direction": False,
"play_order": {"enabled": False, "start_from": 1},
"raise_exceptions": False,
"compresslevel": 6,
}
def __init__(self, name, book, options=None):
self.file_name = name
self.book = book
self.options = dict(self.DEFAULT_OPTIONS)
if options:
self.options.update(options)
self._init_play_order()
def _init_play_order(self):
self._play_order = {"enabled": False, "start_from": 1}
try:
self._play_order["enabled"] = self.options["play_order"]["enabled"]
self._play_order["start_from"] = self.options["play_order"]["start_from"]
except KeyError:
pass
def process(self):
# should cache this html parsing so we don't do it for every plugin
for plg in self.options.get("plugins", []):
if hasattr(plg, "before_write"):
plg.before_write(self.book)
for item in self.book.get_items():
if isinstance(item, EpubHtml):
for plg in self.options.get("plugins", []):
if hasattr(plg, "html_before_write"):
plg.html_before_write(self.book, item)
def _write_container(self):
    """Fill CONTAINER_XML with the book's folder name and write it to CONTAINER_PATH."""
    substitutions = {"folder_name": self.book.FOLDER_NAME}
    self.out.writestr(CONTAINER_PATH, CONTAINER_XML % substitutions)
def _write_opf_metadata(self, root):
    """
    Append the <metadata> element to the OPF package.

    A ``dcterms:modified`` meta element is always written first. Its value
    comes from the "mtime" option when present, otherwise from the current
    time. Every entry of ``self.book.metadata`` is then serialized: entries
    in the OPF namespace become <meta> elements, all others become elements
    named after their namespace and name.

    :Args:
      - root: The package element that receives the metadata element
    """
    # This is really not needed
    # problem is uppercase/lowercase
    # for ns_name, values in six.iteritems(self.book.metadata):
    #     if ns_name:
    #         for n_id, ns_url in six.iteritems(NAMESPACES):
    #             if ns_name == ns_url:
    #                 nsmap[n_id.lower()] = NAMESPACES[n_id]

    nsmap = {"dc": NAMESPACES["DC"], "opf": NAMESPACES["OPF"]}
    nsmap.update(self.book.namespaces)

    metadata = etree.SubElement(root, "metadata", nsmap=nsmap)

    el = etree.SubElement(metadata, "meta", {"property": "dcterms:modified"})
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    if "mtime" in self.options:
        el.text = self.options["mtime"].strftime(stamp_format)
    else:
        import time

        # The trailing "Z" declares UTC, so the default stamp must be taken
        # from UTC and not from local wall-clock time.
        el.text = time.strftime(stamp_format, time.gmtime())

    for ns_name, values in six.iteritems(self.book.metadata):
        if ns_name == NAMESPACES["OPF"]:
            for values2 in values.values():
                for v in values2:
                    # add_metadata() stores None when no attributes are
                    # given; treat that as "no attributes" instead of
                    # failing on the membership test.
                    attrs = v[1] or {}
                    if attrs.get("property") == "dcterms:modified":
                        continue
                    try:
                        el = etree.SubElement(metadata, "meta", attrs)
                        if v[0]:
                            el.text = v[0]
                    except ValueError:
                        logging.error("Could not create metadata.")
        else:
            for name, values2 in six.iteritems(values):
                for v in values2:
                    try:
                        if ns_name:
                            el = etree.SubElement(metadata, "{%s}%s" % (ns_name, name), v[1])  # noqa: UP031
                        else:
                            el = etree.SubElement(metadata, "{name}".format(name=name), v[1])  # noqa: UP032

                        el.text = v[0]
                    except ValueError:
                        logging.info('Could not create metadata "{name}".'.format(name=name))  # noqa: UP032
def _write_opf_manifest(self, root):
    """
    Append the <manifest> element, one <item> per book item flagged for the manifest.

    :Args:
      - root: The package element that receives the manifest element

    :Returns:
      Id of the last EpubNcx item written, or None when there is none.
    """
    manifest = etree.SubElement(root, "manifest")
    ncx_id = None

    # mathml, scripted, svg, remote-resources, and switch
    # nav
    # cover-image

    for entry in self.book.get_items():
        if not entry.manifest:
            continue

        if isinstance(entry, EpubNav):
            attrs = {
                "href": entry.get_name(),
                "id": entry.id,
                "media-type": entry.media_type,
                "properties": "nav",
            }
        elif isinstance(entry, EpubNcx):
            ncx_id = entry.id
            attrs = {"href": entry.file_name, "id": entry.id, "media-type": entry.media_type}
        elif isinstance(entry, EpubCover):
            attrs = {
                "href": entry.file_name,
                "id": entry.id,
                "media-type": entry.media_type,
                "properties": "cover-image",
            }
        else:
            attrs = {"href": entry.file_name, "id": entry.id, "media-type": entry.media_type}

            # optional attributes are only emitted when the item carries them
            if hasattr(entry, "properties") and len(entry.properties) > 0:
                attrs["properties"] = " ".join(entry.properties)

            if hasattr(entry, "media_overlay") and entry.media_overlay is not None:
                attrs["media-overlay"] = entry.media_overlay

            if hasattr(entry, "media_duration") and entry.media_duration is not None:
                attrs["duration"] = entry.media_duration

        etree.SubElement(manifest, "item", attrs)

    return ncx_id
def _write_opf_spine(self, root, ncx_id):
    """
    Append the <spine> element with one <itemref> per spine entry.

    A spine entry is an item object, an item id, or a tuple whose first
    element is one of those and whose optional second element "no" marks
    the entry as non-linear.

    :Args:
      - root: The package element that receives the spine element
      - ncx_id: Id used for the "toc" attribute; "ncx" is used when falsy
    """
    attrs = {"toc": ncx_id or "ncx"}
    if self.book.direction and self.options["spine_direction"]:
        attrs["page-progression-direction"] = self.book.direction

    spine = etree.SubElement(root, "spine", attrs)

    for entry in self.book.spine:
        # this is for now
        # later we should be able to fetch things from tuple

        if isinstance(entry, tuple):
            target = entry[0]
            linear_flag = not (len(entry) > 1 and entry[1] == "no")
        else:
            target = entry
            linear_flag = True

        if isinstance(target, (EpubHtml, EpubItem)):
            opts = {"idref": target.get_id()}

            if not (target.is_linear and linear_flag):
                opts["linear"] = "no"
        else:
            # anything else is used as an item id; the lookup is best effort
            opts = {"idref": target}

            try:
                found = self.book.get_item_with_id(target)

                if not (found.is_linear and linear_flag):
                    opts["linear"] = "no"
            except Exception:
                pass

        etree.SubElement(spine, "itemref", opts)
def _write_opf_guide(self, root):
# - http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.6
if len(self.book.guide) > 0 and self.options.get("epub2_guide"):
guide = etree.SubElement(root, "guide", {})
for item in self.book.guide:
if "item" in item:
chap = item.get("item")
if chap:
_href = chap.file_name
_title = chap.title
else:
_href = item.get("href", "")
_title = item.get("title", "")
if _title is None:
_title = ""
_ref = etree.SubElement(
guide, "reference", {"type": item.get("type", ""), "title": _title, "href": _href}
)
def _write_opf_bindings(self, root):
if len(self.book.bindings) > 0:
bindings = etree.SubElement(root, "bindings", {})
for item in self.book.bindings:
etree.SubElement(bindings, "mediaType", item)
def _write_opf_file(self, root):
    """Serialize the package element and write it as content.opf inside the book folder."""
    opf_path = "{FOLDER_NAME}/content.opf".format(FOLDER_NAME=self.book.FOLDER_NAME)  # noqa: UP032
    opf_bytes = etree.tostring(root, pretty_print=True, encoding="utf-8", xml_declaration=True)

    self.out.writestr(opf_path, opf_bytes)
def _write_opf(self):
    """
    Build the OPF <package> element and write it out.

    The sections are produced in this order: metadata, manifest, spine,
    guide, bindings, and finally the file itself.
    """
    book = self.book

    attrs = {
        "xmlns": NAMESPACES["OPF"],
        "unique-identifier": book.IDENTIFIER_ID,
        "version": "3.0",
    }
    if book.direction and self.options["package_direction"]:
        attrs["dir"] = book.direction

    root = etree.Element("package", attrs)

    # the rendition prefix always comes first, followed by the book's custom prefixes
    root.attrib["prefix"] = " ".join(["rendition: http://www.idpf.org/vocab/rendition/#"] + book.prefixes)

    self._write_opf_metadata(root)
    ncx_id = self._write_opf_manifest(root)
    self._write_opf_spine(root, ncx_id)
    self._write_opf_guide(root)
    self._write_opf_bindings(root)
    self._write_opf_file(root)
def _get_nav(self, item):
    """
    Render the navigation document for the given nav item.

    The document is built from the book's "nav" template and contains the
    table of contents, optionally a landmarks list (when the book has guide
    entries and "epub3_landmark" is enabled) and optionally a page list
    (when "epub3_pages" is enabled and pages were found).

    :Args:
      - item: Navigation item; its file name, title, links and direction are used

    :Returns:
      Serialized navigation document.
    """
    # just a basic navigation for now
    nav_xml = parse_string(self.book.get_template("nav"))
    root = nav_xml.getroot()

    root.set("lang", self.book.language)
    root.attrib["{%s}lang" % NAMESPACES["XML"]] = self.book.language  # noqa

    # all hrefs in the document are made relative to the nav file's folder
    nav_dir_name = os.path.dirname(item.file_name)

    head = etree.SubElement(root, "head")
    title = etree.SubElement(head, "title")
    title.text = item.title or self.book.title

    # for now this just handles css files and ignores others
    for _link in item.links:
        etree.SubElement(head, "link", {"href": _link.get("href", ""), "rel": "stylesheet", "type": "text/css"})

    body = etree.SubElement(root, "body")
    if item.direction:
        body.set("dir", item.direction)

    nav = etree.SubElement(
        body,
        "nav",
        {
            "{%s}type" % NAMESPACES["EPUB"]: "toc",  # noqa
            "id": "id",
            "role": "doc-toc",
        },
    )

    content_title = etree.SubElement(nav, "h2")
    content_title.text = item.title or self.book.title

    def _create_section(itm, items):
        # Writes one <ol> level; a (section, children) pair recurses into children.
        ol = etree.SubElement(itm, "ol")
        for item in items:
            if isinstance(item, (tuple, list)):
                li = etree.SubElement(ol, "li")
                if isinstance(item[0], EpubHtml):
                    a = etree.SubElement(li, "a", {"href": zip_path.relpath(item[0].file_name, nav_dir_name)})
                elif isinstance(item[0], Section) and item[0].href != "":
                    a = etree.SubElement(li, "a", {"href": zip_path.relpath(item[0].href, nav_dir_name)})
                elif isinstance(item[0], Link):
                    a = etree.SubElement(li, "a", {"href": zip_path.relpath(item[0].href, nav_dir_name)})
                else:
                    # no target available: emit a plain label instead of a link
                    a = etree.SubElement(li, "span")
                a.text = item[0].title

                _create_section(li, item[1])

            elif isinstance(item, Link):
                li = etree.SubElement(ol, "li")
                a = etree.SubElement(li, "a", {"href": zip_path.relpath(item.href, nav_dir_name)})
                a.text = item.title
            elif isinstance(item, EpubHtml):
                li = etree.SubElement(ol, "li")
                a = etree.SubElement(li, "a", {"href": zip_path.relpath(item.file_name, nav_dir_name)})
                a.text = item.title

    _create_section(nav, self.book.toc)

    # LANDMARKS / GUIDE
    # - http://www.idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def-types-landmarks

    if len(self.book.guide) > 0 and self.options.get("epub3_landmark"):
        # Epub2 guide types do not map completely to epub3 landmark types.
        guide_to_landscape_map = {"notes": "rearnotes", "text": "bodymatter"}

        guide_nav = etree.SubElement(body, "nav", {"{%s}type" % NAMESPACES["EPUB"]: "landmarks"})  # noqa: UP031

        guide_content_title = etree.SubElement(guide_nav, "h2")
        guide_content_title.text = self.options.get("landmark_title", "Guide")

        guild_ol = etree.SubElement(guide_nav, "ol")

        for elem in self.book.guide:
            li_item = etree.SubElement(guild_ol, "li")

            chap = elem.get("item", None) if "item" in elem else None

            if chap:
                _href = chap.file_name
                _title = chap.title
            else:
                # Also reached when "item" is present but empty. That case
                # used to leave _href/_title unbound or carrying the values
                # of the previous entry.
                _href = elem.get("href", "")
                _title = elem.get("title", "")

            guide_type = elem.get("type", "")
            a_item = etree.SubElement(
                li_item,
                "a",
                {
                    "{%s}type" % NAMESPACES["EPUB"]: guide_to_landscape_map.get(guide_type, guide_type),  # noqa: UP031
                    "href": zip_path.relpath(_href, nav_dir_name),
                },
            )
            a_item.text = _title

    # PAGE-LIST
    if self.options.get("epub3_pages"):
        inserted_pages = get_pages_for_items(
            [item for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT) if not isinstance(item, EpubNav)]
        )

        if len(inserted_pages) > 0:
            pagelist_nav = etree.SubElement(
                body,
                "nav",
                {
                    "{%s}type" % NAMESPACES["EPUB"]: "page-list",  # noqa: UP031
                    "id": "pages",
                    "hidden": "hidden",
                },
            )
            pagelist_content_title = etree.SubElement(pagelist_nav, "h2")
            pagelist_content_title.text = self.options.get("pages_title", "Pages")

            pages_ol = etree.SubElement(pagelist_nav, "ol")

            for filename, pageref, label in inserted_pages:
                li_item = etree.SubElement(pages_ol, "li")

                _href = "{filename}#{pageref}".format(filename=filename, pageref=pageref)  # noqa: UP032
                _title = label

                a_item = etree.SubElement(
                    li_item,
                    "a",
                    {
                        "href": zip_path.relpath(_href, nav_dir_name),
                    },
                )
                a_item.text = _title

    tree_str = etree.tostring(nav_xml, pretty_print=True, encoding="utf-8", xml_declaration=True)

    return tree_str
def _get_ncx(self):
    """
    Render the NCX document from the book's "ncx" template and table of contents.

    :Returns:
      Serialized NCX document.
    """
    # we should be able to setup language for NCX as also
    ncx = parse_string(self.book.get_template("ncx"))
    root = ncx.getroot()

    head = etree.SubElement(root, "head")

    # get this id
    for meta_name, meta_content in (
        ("dtb:uid", self.book.uid),
        ("dtb:depth", "0"),
        ("dtb:totalPageCount", "0"),
        ("dtb:maxPageNumber", "0"),
    ):
        etree.SubElement(head, "meta", {"content": meta_content, "name": meta_name})

    doc_title = etree.SubElement(root, "docTitle")
    title = etree.SubElement(doc_title, "text")
    title.text = self.book.title

    # doc_author = etree.SubElement(root, 'docAuthor')
    # author = etree.SubElement(doc_author, 'text')
    # author.text = 'Name of the person'

    # For now just make a very simple navMap
    nav_map = etree.SubElement(root, "navMap")

    def _new_nav_point(parent, point_id, label, src):
        # navPoint with optional playOrder, then navLabel/text, then content
        point = etree.SubElement(parent, "navPoint", {"id": point_id})

        if self._play_order["enabled"]:
            point.set("playOrder", str(self._play_order["start_from"]))
            self._play_order["start_from"] += 1

        label_el = etree.SubElement(point, "navLabel")
        text_el = etree.SubElement(label_el, "text")
        text_el.text = label

        etree.SubElement(point, "content", {"src": src})

        return point

    def _backfill_parent_src(parent, src):
        # a parent whose content src is still empty takes the src of this child
        content = parent.find("content")

        if content is not None and content.get("src") == "":
            content.set("src", src)

    def _create_section(parent, entries, uid):
        for entry in entries:
            if isinstance(entry, (tuple, list)):
                section, subsection = entry[0], entry[1]

                # CAN NOT HAVE EMPTY SRC HERE
                if isinstance(section, EpubHtml):
                    point_id = section.get_id()
                    src = section.file_name
                else:
                    point_id = "sep_{uid}".format(uid=uid)  # noqa: UP032

                    if isinstance(section, Section) and section.href != "":
                        src = section.href
                    elif isinstance(section, Link):
                        src = section.href
                    else:
                        src = ""

                point = _new_nav_point(parent, point_id, section.title, src)

                uid = _create_section(point, subsection, uid + 1)
            elif isinstance(entry, Link):
                _backfill_parent_src(parent, entry.href)
                _new_nav_point(parent, entry.uid, entry.title, entry.href)
            elif isinstance(entry, EpubHtml):
                _backfill_parent_src(parent, entry.file_name)
                _new_nav_point(parent, entry.get_id(), entry.title, entry.file_name)

        return uid

    _create_section(nav_map, self.book.toc, 0)

    return etree.tostring(root, pretty_print=True, encoding="utf-8", xml_declaration=True)
def _write_items(self):
    """
    Write every item of the book into the open archive ``self.out``.

    NCX and navigation items are generated on the fly (``_get_ncx`` /
    ``_get_nav``); all other items are written with their own content.
    NCX, navigation and manifest items are stored below
    ``self.book.FOLDER_NAME``; anything else is stored under its bare
    file name.
    """
    for item in self.book.get_items():
        is_ncx = isinstance(item, EpubNcx)
        is_nav = (not is_ncx) and isinstance(item, EpubNav)

        # decide where the entry goes inside of the archive
        if is_ncx or is_nav or item.manifest:
            arcname = "{FOLDER_NAME}/{file_name}".format(  # noqa: UP032
                FOLDER_NAME=self.book.FOLDER_NAME, file_name=item.file_name
            )
        else:
            arcname = "{file_name}".format(file_name=item.file_name)  # noqa: UP032

        # decide what gets written
        if is_ncx:
            payload = self._get_ncx()
        elif is_nav:
            payload = self._get_nav(item)
        else:
            payload = item.get_content()

        self.out.writestr(arcname, payload)
def write(self):
    """
    Create the EPUB archive at ``self.file_name`` and write the book into it.

    The ``mimetype`` entry is written first and stored uncompressed, followed
    by the container file, the OPF package and all items. On Python 3 the
    ``compresslevel`` option is forwarded to ``zipfile.ZipFile``.

    The archive is closed even when one of the write steps raises, so the
    file handle is never leaked; the exception is propagated unchanged.
    """
    if six.PY2:
        # ZipFile on Python 2 does not accept compresslevel
        self.out = zipfile.ZipFile(self.file_name, "w", zipfile.ZIP_DEFLATED)
    else:
        self.out = zipfile.ZipFile(
            self.file_name, "w", zipfile.ZIP_DEFLATED, compresslevel=self.options["compresslevel"]
        )

    try:
        self.out.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)

        self._write_container()
        self._write_opf()
        self._write_items()
    finally:
        self.out.close()
class EpubReader(object):  # noqa: UP004
    """
    Reads an EPUB (zip archive or unpacked directory) into an EpubBook.

    Usage is ``reader.load()`` followed by ``reader.process()``; see ``read_epub``.

    Options read by this class:
      - ignore_ncx: when true and the book has a navigation document, the NCX is not parsed
      - plugins: list of plugin objects whose read hooks are called from ``process``
    """

    DEFAULT_OPTIONS = {"ignore_ncx": True}

    def __init__(self, epub_file_name, options=None):
        """
        :Args:
          - epub_file_name: path of the EPUB file or directory (or anything ``zipfile.ZipFile`` accepts)
          - options: extra options as dictionary (optional), merged over DEFAULT_OPTIONS
        """
        self.file_name = epub_file_name
        self.book = EpubBook()
        self.zf = None

        self.opf_file = ""
        self.opf_dir = ""

        self.options = dict(self.DEFAULT_OPTIONS)
        if options:
            self.options.update(options)

        self._check_deprecated()

    def _check_deprecated(self):
        """Warn when the ``ignore_ncx`` option is explicitly unset (None)."""
        if self.options.get("ignore_ncx") is None:
            warnings.warn("In the future version we will turn default option ignore_ncx to True.", stacklevel=2)

    def process(self):
        """Run the ``after_read`` and ``html_after_read`` hooks of all configured plugins."""
        # should cache this html parsing so we don't do it for every plugin
        for plg in self.options.get("plugins", []):
            if hasattr(plg, "after_read"):
                plg.after_read(self.book)

        for item in self.book.get_items():
            if isinstance(item, EpubHtml):
                for plg in self.options.get("plugins", []):
                    if hasattr(plg, "html_after_read"):
                        plg.html_after_read(self.book, item)

    def load(self):
        """
        Load the book from ``self.file_name``.

        :Returns:
          The populated EpubBook instance.
        """
        self._load()

        return self.book

    def read_file(self, name):
        """
        Return the raw content of one archive member.

        :Args:
          - name: path inside of the archive, normalized before lookup

        Raises KeyError when the member does not exist.
        """
        # Raises KeyError
        name = zip_path.normpath(name)
        return self.zf.read(name)

    def _load_container(self):
        """Read META-INF/container.xml and remember path and directory of the OPF package file."""
        meta_inf = self.read_file("META-INF/container.xml")
        tree = parse_string(meta_inf)

        for root_file in tree.findall(
            ".//xmlns:rootfile[@media-type]", namespaces={"xmlns": NAMESPACES["CONTAINERNS"]}
        ):
            if root_file.get("media-type") == "application/oebps-package+xml":
                self.opf_file = root_file.get("full-path")
                self.opf_dir = zip_path.dirname(self.opf_file)

    def _load_metadata(self):
        """
        Read version, unique identifier and the metadata section of the OPF file.

        Metadata is stored in ``self.book.metadata`` as
        ``{namespace: {name: [(value, attributes), ...]}}``; title and uid of
        the book are set from the DC title/identifier entries.
        """
        container_root = self.container.getroot()

        # get epub version
        self.book.version = container_root.get("version", None)

        # get unique-identifier
        if container_root.get("unique-identifier", None):
            self.book.IDENTIFIER_ID = container_root.get("unique-identifier")

        # get xml:lang
        # get metadata
        metadata = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "metadata"))  # noqa: UP031

        nsmap = metadata.nsmap
        # Element tags are reported in Clark notation ("{uri}local"), so the
        # default namespace has to be wrapped in braces before it is compared
        # with t.tag below. A bare URI would never match and every <meta>
        # element would be filed under the tag name "meta".
        default_uri = nsmap.get(None)
        default_ns = "{%s}" % default_uri if default_uri else ""  # noqa: UP031

        nsdict = {v: {} for v in nsmap.values()}

        def add_item(ns, tag, value, extra):
            if ns not in nsdict:
                nsdict[ns] = {}

            values = nsdict[ns].setdefault(tag, [])
            values.append((value, extra))

        for t in metadata:
            if not etree.iselement(t) or t.tag is etree.Comment:
                continue

            if t.tag == default_ns + "meta":
                name = t.get("name")
                others = dict(t.items())

                # "prefix:name" is resolved through the namespace map of the element
                if name and ":" in name:
                    prefix, name = name.split(":", 1)
                else:
                    prefix = None

                add_item(t.nsmap.get(prefix, prefix), name, t.text, others)
            else:
                tag = t.tag[t.tag.rfind("}") + 1 :]

                if (t.prefix and t.prefix.lower() == "dc") and tag == "identifier":
                    _id = t.get("id", None)

                    if _id:
                        self.book.IDENTIFIER_ID = _id

                others = dict(t.items())
                add_item(t.nsmap[t.prefix], tag, t.text, others)

        self.book.metadata = nsdict

        titles = self.book.get_metadata("DC", "title")
        if len(titles) > 0:
            self.book.title = titles[0][0]

        for value, others in self.book.get_metadata("DC", "identifier"):
            if others.get("id") == self.book.IDENTIFIER_ID:
                self.book.uid = value

    def _load_manifest(self):
        """
        Create one item per manifest entry of the OPF file and add it to the book.

        The item class is chosen by media type and properties; the content of
        every item is read from the archive relative to the OPF directory.
        """
        for r in self.container.find("{%s}%s" % (NAMESPACES["OPF"], "manifest")):  # noqa: UP031
            if r is not None and r.tag != "{%s}item" % NAMESPACES["OPF"]:  # noqa: UP031
                continue

            media_type = r.get("media-type")
            _properties = r.get("properties", "")

            if _properties:
                properties = _properties.split(" ")
            else:
                properties = []

            # people use wrong content types
            if media_type == "image/jpg":
                media_type = "image/jpeg"

            if media_type == "application/x-dtbncx+xml":
                ei = EpubNcx(uid=r.get("id"), file_name=unquote(r.get("href")))

                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
            elif media_type == "application/smil+xml":
                ei = EpubSMIL(uid=r.get("id"), file_name=unquote(r.get("href")))

                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
            elif media_type == "application/xhtml+xml":
                if "nav" in properties:
                    ei = EpubNav(uid=r.get("id"), file_name=unquote(r.get("href")))

                    # use the unquoted name like every other branch; the raw
                    # href fails for percent-encoded file names
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
                elif "cover" in properties:
                    # NOTE(review): id and file name of the manifest entry are
                    # not applied here, the item keeps the EpubCoverHtml defaults
                    ei = EpubCoverHtml()

                    ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get("href"))))
                else:
                    ei = EpubHtml()

                    ei.id = r.get("id")
                    ei.file_name = unquote(r.get("href"))
                    ei.media_type = media_type
                    ei.media_overlay = r.get("media-overlay", None)
                    ei.media_duration = r.get("duration", None)
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
                    ei.properties = properties
            elif media_type in IMAGE_MEDIA_TYPES:
                if "cover-image" in properties:
                    ei = EpubCover(uid=r.get("id"), file_name=unquote(r.get("href")))

                    ei.media_type = media_type
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
                else:
                    ei = EpubImage()

                    ei.id = r.get("id")
                    ei.file_name = unquote(r.get("href"))
                    ei.media_type = media_type
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
            else:
                # different types
                ei = EpubItem()

                ei.id = r.get("id")
                ei.file_name = unquote(r.get("href"))
                ei.media_type = media_type
                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))

            self.book.add_item(ei)

    def _parse_ncx(self, data):
        """
        Build ``self.book.toc`` from the navMap of an NCX document.

        navPoints with nested navPoints become ``(Section, children)`` tuples,
        leaf navPoints become ``Link`` objects. A missing or empty navMap
        results in an empty table of contents.

        :Args:
          - data: content of the NCX file
        """
        tree = parse_string(data)
        tree_root = tree.getroot()

        nav_map = tree_root.find("{%s}navMap" % NAMESPACES["DAISY"])  # noqa: UP031

        if nav_map is None:
            self.book.toc = []
            return

        label_tag = "{%s}navLabel" % NAMESPACES["DAISY"]  # noqa: UP031
        content_tag = "{%s}content" % NAMESPACES["DAISY"]  # noqa: UP031
        point_tag = "{%s}navPoint" % NAMESPACES["DAISY"]  # noqa: UP031

        def _get_children(elems, n, nid):
            label, content = "", ""
            children = []

            # iterate the element directly, getchildren() is deprecated in lxml
            for a in elems:
                if a.tag == label_tag:
                    # keep the default label when navLabel has no child element
                    if len(a) > 0:
                        label = a[0].text
                if a.tag == content_tag:
                    content = a.get("src", "")
                if a.tag == point_tag:
                    children.append(_get_children(a, n + 1, a.get("id", "")))

            if n == 0:
                # the navMap itself always yields a list, even when it is empty
                return children

            if len(children) > 0:
                return (Section(label, href=content), children)

            return Link(content, label, nid)

        self.book.toc = _get_children(nav_map, 0, "")

    def _parse_nav(self, data, base_path, navtype="toc"):
        """
        Read the table of contents or the page list from a navigation document.

        With ``navtype="toc"`` the result is stored in ``self.book.toc``; with
        any other value the page-list is stored in ``self.book.pages`` and every
        page is also appended to the ``pages`` list of the EpubHtml item it
        points into. Nothing is changed when the requested ``nav`` element or
        its ``ol`` list is missing.

        :Args:
          - data: content of the navigation document
          - base_path: directory of the navigation document, used to resolve hrefs
          - navtype: "toc" for the table of contents, anything else for the page list
        """
        html_node = parse_html_string(data)

        if navtype == "toc":
            # parsing the table of contents
            nav_nodes = html_node.xpath("//nav[@*='toc']")
        else:
            # parsing the list of pages
            nav_nodes = html_node.xpath("//nav[@*='page-list']")

        if len(nav_nodes) == 0:
            return

        list_root = nav_nodes[0].find("ol")
        if list_root is None:
            return

        def parse_list(list_node):
            items = []

            for item_node in list_node.findall("li"):
                sublist_node = item_node.find("ol")
                link_node = item_node.find("a")

                if sublist_node is not None:
                    # the first child of the list item carries the section title
                    title = item_node[0].text_content()
                    children = parse_list(sublist_node)

                    if link_node is not None and link_node.get("href"):
                        href = zip_path.normpath(zip_path.join(base_path, link_node.get("href")))
                        items.append((Section(title, href=href), children))
                    else:
                        items.append((Section(title), children))
                elif link_node is not None and link_node.get("href"):
                    title = link_node.text_content()
                    href = zip_path.normpath(zip_path.join(base_path, link_node.get("href")))

                    items.append(Link(href, title))

            return items

        if navtype == "toc":
            self.book.toc = parse_list(list_root)
        else:
            # generate the pages list if there is one
            self.book.pages = parse_list(list_root)

            # generate the per-file pages lists
            # because of the order of parsing the files, this can't be done
            # when building the EpubHtml objects
            htmlfiles = {}
            for htmlfile in self.book.items:
                if isinstance(htmlfile, EpubHtml):
                    htmlfiles[htmlfile.file_name] = htmlfile

            for page in self.book.pages:
                # everything before the first "#" is the file name
                filename = page.href.partition("#")[0]

                if filename in htmlfiles:
                    htmlfiles[filename].pages.append(page)

    def _load_spine(self):
        """
        Read the spine of the OPF file and, when required, the NCX it references.

        The NCX is parsed when ``ignore_ncx`` is not set or when the book has
        no navigation document. Raises EpubException when the NCX named by the
        spine can not be found.
        """
        spine = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "spine"))  # noqa: UP031

        self.book.spine = [(t.get("idref"), t.get("linear", "yes")) for t in spine]

        toc = spine.get("toc", "")
        self.book.set_direction(spine.get("page-progression-direction", None))

        # should read ncx or nav file
        nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None)
        if toc:
            if not self.options.get("ignore_ncx") or not nav_item:
                ncx_item = self.book.get_item_with_id(toc)
                if ncx_item is None:
                    # the spine points to an id which is not in the manifest
                    raise EpubException(-1, "Can not find ncx file.")

                try:
                    ncxFile = self.read_file(zip_path.join(self.opf_dir, ncx_item.get_name()))
                except KeyError:
                    raise EpubException(-1, "Can not find ncx file.")  # noqa: B904

                self._parse_ncx(ncxFile)

    def _load_guide(self):
        """Read the optional guide section of the OPF file into ``self.book.guide``."""
        guide = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "guide"))  # noqa: UP031
        if guide is not None:
            self.book.guide = [{"href": t.get("href"), "title": t.get("title"), "type": t.get("type")} for t in guide]

    def _load_opf_file(self):
        """
        Parse the OPF package file and load metadata, manifest, spine and guide.

        Afterwards the navigation document (if any) is used for the table of
        contents when ``ignore_ncx`` is set or no table of contents was read
        from the NCX, and always for the page list.

        Raises EpubException when the OPF file is not in the archive.
        """
        try:
            s = self.read_file(self.opf_file)
        except KeyError:
            raise EpubException(-1, "Can not find container file")  # noqa: B904

        self.container = parse_string(s)

        self._load_metadata()
        self._load_manifest()
        self._load_spine()
        self._load_guide()

        # read nav file if found
        #
        nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None)
        if nav_item:
            if self.options.get("ignore_ncx") or not self.book.toc:
                self._parse_nav(nav_item.content, zip_path.dirname(nav_item.file_name), navtype="toc")
            self._parse_nav(nav_item.content, zip_path.dirname(nav_item.file_name), navtype="pages")

    def _load(self):
        """
        Open the EPUB (directory or zip archive) and load its content.

        The archive is closed again when loading is done, also when loading
        fails. Raises EpubException for a bad or too large zip file.
        """
        self.zf = None

        # a path pointing to a directory is read as an unpacked EPUB
        if (six.PY3 and isinstance(self.file_name, (str, bytes, os.PathLike))) or (
            six.PY2 and isinstance(self.file_name, (six.string_types))
        ):
            if os.path.isdir(self.file_name):
                self.zf = Directory(self.file_name)

        if self.zf is None:
            try:
                self.zf = zipfile.ZipFile(self.file_name, "r", compression=zipfile.ZIP_DEFLATED, allowZip64=True)
            except zipfile.BadZipfile:
                raise EpubException(0, "Bad Zip file")  # noqa: B904
            except zipfile.LargeZipFile:
                raise EpubException(1, "Large Zip file")  # noqa: B904

        try:
            # 1st check metadata
            self._load_container()
            self._load_opf_file()
        finally:
            # do not leak the archive handle when parsing fails
            self.zf.close()
# WRITE
def write_epub(name, book, options=None):
    """
    Creates epub file with the content defined in EpubBook.

    >>> ebooklib.write_epub('book.epub', book)

    :Args:
      - name: file name for the output file
      - book: instance of EpubBook
      - options: extra options as dictionary (optional)

    :Returns:
      True when the file was written. When writing fails with OSError a
      warning is issued and False is returned, unless the ``raise_exceptions``
      option is set, in which case the error is raised again.
    """
    writer = EpubWriter(name, book, options)
    writer.process()

    try:
        writer.write()
    except OSError:
        warnings.warn("In the future throwing exceptions while writing will be default behavior.", stacklevel=2)
        exc_type, exc_value, exc_tb = sys.exc_info()

        if not (options and options.get("raise_exceptions")):
            return False

        six.reraise(exc_type, exc_value, exc_tb)

    return True
# READ
def read_epub(name, options=None):
    """
    Creates new instance of EpubBook with the content defined in the input file.

    The book is loaded first and the read hooks of the configured plugins are
    run afterwards.

    >>> book = ebooklib.read_epub('book.epub')

    :Args:
      - name: full path to the input file
      - options: extra options as dictionary (optional)

    :Returns:
      Instance of EpubBook.
    """
    epub_reader = EpubReader(name, options)

    loaded_book = epub_reader.load()
    epub_reader.process()

    return loaded_book
ebooklib-0.20/ebooklib/plugins/ 0000775 0000000 0000000 00000000000 15077504734 0016504 5 ustar 00root root 0000000 0000000 ebooklib-0.20/ebooklib/plugins/__init__.py 0000664 0000000 0000000 00000000000 15077504734 0020603 0 ustar 00root root 0000000 0000000 ebooklib-0.20/ebooklib/plugins/base.py 0000664 0000000 0000000 00000003031 15077504734 0017765 0 ustar 00root root 0000000 0000000 # This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
class BasePlugin(object):  # noqa: UP004
    """
    Base class for plugins.

    Every hook is a no-op which returns True; subclasses override the hooks
    they are interested in.
    """

    def before_write(self, book):
        """Hook for processing the book before it is written."""
        return True

    def after_write(self, book):
        """Hook for processing the book after it is written."""
        return True

    def before_read(self, book):
        """Hook for processing the book before it is read."""
        return True

    def after_read(self, book):
        """Hook for processing the book after it is read."""
        return True

    def item_after_read(self, book, item):
        """Hook for processing a general item after it is read."""
        return True

    def item_before_write(self, book, item):
        """Hook for processing a general item before it is written."""
        return True

    def html_after_read(self, book, chapter):
        """Hook for processing an HTML chapter after it is read."""
        return True

    def html_before_write(self, book, chapter):
        """Hook for processing an HTML chapter before it is written."""
        return True
ebooklib-0.20/ebooklib/plugins/booktype.py 0000664 0000000 0000000 00000011056 15077504734 0020715 0 ustar 00root root 0000000 0000000 # This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string
class BooktypeLinks(BasePlugin):
    """
    Rewrites the links of a chapter before it is written.

    Internal links (links without a scheme) get ".xhtml" appended to their
    path, footnote links are reduced to their fragment and the ``name``
    attribute of anchors is turned into ``id``.
    """

    NAME = "Booktype Links"

    def __init__(self, booktype_book):
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """
        Rewrite the links in ``chapter.content`` in place.

        Chapters which can not be parsed are left untouched.
        """
        try:
            from urlparse import urljoin, urlparse
        except ImportError:
            from urllib.parse import urljoin, urlparse

        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        root = tree.getroottree()

        # a document without body has no links to rewrite
        root_body = root.find("body")
        if root_body is not None and len(root_body) != 0:
            body = tree.find("body")

            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference
            for _link in body.xpath("//a"):
                # This is just temporary for the footnotes
                _ln = _link.get("href", "")
                if _ln.find("InsertNoteID") != -1:
                    i = _ln.find("#")
                    # only cut when there is a fragment; slicing with -1
                    # would leave just the last character of the href
                    if i != -1:
                        _link.set("href", _ln[i:])

                    continue

                _u = urlparse(_ln)

                # Let us care only for internal links at the moment
                if _u.scheme == "":
                    if _u.path != "":
                        _link.set("href", "{path}.xhtml".format(path=_u.path))  # noqa: UP032

                    if _u.fragment != "":
                        _link.set("href", urljoin(_link.get("href"), "#{fragment}".format(fragment=_u.fragment)))  # noqa: UP032

                    if _link.get("name") is not None:
                        _link.set("id", _link.get("name"))
                        etree.strip_attributes(_link, "name")

        chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8")
class BooktypeFootnotes(BasePlugin):
    """
    Converts Booktype footnotes of a chapter into EPUB3 footnotes.

    Every footnote marker gets ``epub:type="noteref"`` on its link and an
    ``aside`` with ``epub:type="footnote"`` is appended to the body; the
    original list of notes is removed afterwards.
    """

    NAME = "Booktype Footnotes"

    def __init__(self, booktype_book):
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """
        Convert the footnotes in ``chapter.content`` in place.

        Chapters which can not be parsed are left untouched; markers which do
        not have the expected structure are skipped.
        """
        from lxml import etree

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        root = tree.getroottree()

        root_body = root.find("body")
        if root_body is not None and len(root_body) != 0:
            body = tree.find("body")

            # Expected input, as far as it can be told from the code below:
            #  - span.InsertNoteMarker, its id is the id of the note plus a
            #    suffix of 8 characters, its first grandchild is the link
            #  - ol#InsertNote_NoteList with one li per note, identified by
            #    the id of the note
            # Output:
            #  - the link is marked with epub:type="noteref"
            #  - aside with epub:type="footnote" and the id of the note
            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
                marker_id = footnote.get("id")
                # skip markers without id or without the nested link
                if not marker_id or len(footnote) == 0 or len(footnote[0]) == 0:
                    continue

                footnote_id = marker_id[:-8]
                a = footnote[0][0]

                notes = body.xpath('//li[@id="{footnote_id}"]'.format(footnote_id=footnote_id))  # noqa: UP032
                if len(notes) == 0:
                    # marker without a matching note
                    continue
                footnote_text = notes[0]

                a.attrib["{%s}type" % epub.NAMESPACES["EPUB"]] = "noteref"  # noqa

                ftn = etree.SubElement(body, "aside", {"id": footnote_id})
                ftn.attrib["{%s}type" % epub.NAMESPACES["EPUB"]] = "footnote"  # noqa
                ftn_p = etree.SubElement(ftn, "p")
                ftn_p.text = footnote_text.text

            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
            if len(old_footnote) > 0:
                # remove through the real parent, the list is not necessarily
                # a direct child of body
                note_list = old_footnote[0]
                note_list.getparent().remove(note_list)

        chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8")
ebooklib-0.20/ebooklib/plugins/sourcecode.py 0000664 0000000 0000000 00000004635 15077504734 0021221 0 ustar 00root root 0000000 0000000 # This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string
class SourceHighlighter(BasePlugin):
    """
    Highlights embedded source code of a chapter with Pygments.

    ``pre`` elements with a class containing "source-python" or "source-css"
    are replaced with the highlighted markup. When at least one element was
    replaced, the "style/code.css" stylesheet is linked from the chapter.
    """

    def __init__(self):
        pass

    def html_before_write(self, book, chapter):
        """
        Highlight the source code blocks in ``chapter.content`` in place.

        Chapters which can not be parsed, or which have no supported source
        code blocks, are left untouched.
        """
        from lxml import etree, html
        from pygments import highlight
        from pygments.formatters import HtmlFormatter

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        root = tree.getroottree()

        had_source = False

        root_body = root.find("body")
        if root_body is not None and len(root_body) != 0:
            body = tree.find("body")

            # check for embedded source
            for source in body.xpath('//pre[contains(@class,"source-")]'):
                css_class = source.get("class")

                # when both classes are present the CSS lexer wins
                lexer = None
                if "source-python" in css_class:
                    from pygments.lexers import PythonLexer

                    lexer = PythonLexer()

                if "source-css" in css_class:
                    from pygments.lexers import CssLexer

                    lexer = CssLexer()

                if lexer is None:
                    # "source-*" class for a language which is not supported,
                    # leave the element as it is
                    continue

                # serialize the children as text; tostring() returns bytes by
                # default, which can not be joined with str on Python 3
                source_text = (source.text or "") + "".join(
                    html.tostring(child, encoding="unicode") for child in source.iterchildren()
                )

                # HtmlFormatter(linenos="inline") would add line numbers
                _text = highlight(source_text, lexer, HtmlFormatter())

                _parent = source.getparent()
                _parent.replace(source, etree.XML(_text))

                had_source = True

        if had_source:
            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
            chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8")
ebooklib-0.20/ebooklib/plugins/standard.py 0000664 0000000 0000000 00000036002 15077504734 0020657 0 ustar 00root root 0000000 0000000 # This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
import six
from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string
# TODO:
# - should also look for the _required_ elements
# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element
# Attribute names which are kept on every element. SyntaxPlugin passes this
# list, extended with the attributes specific to a tag, to leave_only().
ATTRIBUTES_GLOBAL = [
"accesskey",
"class",
"contenteditable",
"contextmenu",
"dir",
"draggable",
"dropzone",
"hidden",
"id",
"inert",
"itemid",
"itemprop",
"itemref",
"itemscope",
"itemtype",
"lang",
"spellcheck",
"style",
"tabindex",
"title",
"translate",
"epub:type",
]
# Remove for now from here
# Tag names which SyntaxPlugin strips from the document with
# etree.strip_tags() before the attributes are checked.
DEPRECATED_TAGS = [
"acronym",
"applet",
"basefont",
"big",
"center",
"dir",
"font",
"frame",
"frameset",
"isindex",
"noframes",
"s",
"strike",
"tt",
]
def leave_only(item, tag_list):
    """
    Remove every attribute of ``item`` which is not listed in ``tag_list``.

    :Args:
      - item: element with a mapping of attributes in ``item.attrib``
      - tag_list: names of the attributes to keep
    """
    allowed = frozenset(tag_list)

    # iterate over a snapshot of the names, the mapping is modified in the loop
    for _attr in list(item.attrib.keys()):
        if _attr not in allowed:
            del item.attrib[_attr]
class SyntaxPlugin(BasePlugin):
NAME = "Check HTML syntax"
def html_before_write(self, book, chapter):
from lxml import etree
try:
tree = parse_html_string(chapter.content)
except Exception:
return
root = tree.getroottree()
# delete deprecated tags
# i should really have a list of allowed tags
for tag in DEPRECATED_TAGS:
etree.strip_tags(root, tag)
head = tree.find("head")
if head is not None and len(head) != 0:
for _item in head:
if _item.tag == "base":
leave_only(_item, ATTRIBUTES_GLOBAL + ["href", "target"])
elif _item.tag == "link":
leave_only(
_item, ATTRIBUTES_GLOBAL + ["href", "crossorigin", "rel", "media", "hreflang", "type", "sizes"]
)
elif _item.tag == "title":
if _item.text == "":
head.remove(_item)
elif _item.tag == "meta":
leave_only(_item, ATTRIBUTES_GLOBAL + ["name", "http-equiv", "content", "charset"])
# just remove for now, but really should not be like this
head.remove(_item)
elif _item.tag == "script":
leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "charset", "async", "defer", "crossorigin"])
elif _item.tag == "source":
leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "media"])
elif _item.tag == "style":
leave_only(_item, ATTRIBUTES_GLOBAL + ["media", "type", "scoped"])
else:
leave_only(_item, ATTRIBUTES_GLOBAL)
if len(root.find("body")) != 0:
body = tree.find("body")
for _item in body.iter():
# it is not
#
if _item.tag == "a":
leave_only(_item, ATTRIBUTES_GLOBAL + ["href", "target", "download", "rel", "hreflang", "type"])
elif _item.tag == "area":
leave_only(
_item,
ATTRIBUTES_GLOBAL
+ ["alt", "coords", "shape", "href", "target", "download", "rel", "hreflang", "type"],
)
elif _item.tag == "audio":
leave_only(
_item,
ATTRIBUTES_GLOBAL
+ ["src", "crossorigin", "preload", "autoplay", "mediagroup", "loop", "muted", "controls"],
)
elif _item.tag == "blockquote":
leave_only(_item, ATTRIBUTES_GLOBAL + ["cite"])
elif _item.tag == "button":
leave_only(
_item,
ATTRIBUTES_GLOBAL
+ [
"autofocus",
"disabled",
"form",
"formaction",
"formenctype",
"formmethod",
"formnovalidate",
"formtarget",
"name",
"type",
"value",
"menu",
],
)
elif _item.tag == "canvas":
leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"])
elif _item.tag == "canvas":
leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"])
elif _item.tag == "del":
leave_only(_item, ATTRIBUTES_GLOBAL + ["cite", "datetime"])
elif _item.tag == "details":
leave_only(_item, ATTRIBUTES_GLOBAL + ["open"])
elif _item.tag == "embed":
leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "width", "height"])
elif _item.tag == "fieldset":
leave_only(_item, ATTRIBUTES_GLOBAL + ["disable", "form", "name"])
elif _item.tag == "details":
leave_only(
_item,
ATTRIBUTES_GLOBAL
+ [
"accept-charset",
"action",
"autocomplete",
"enctype",
"method",
"name",
"novalidate",
"target",
],
)
elif _item.tag == "iframe":
leave_only(
_item,
ATTRIBUTES_GLOBAL
+ ["src", "srcdoc", "name", "sandbox", "seamless", "allowfullscreen", "width", "height"],
)
elif _item.tag == "img":
_src = _item.get("src", "").lower()
if _src.startswith("http://") or _src.startswith("https://"):
if "remote-resources" not in chapter.properties:
chapter.properties.append("remote-resources")
# THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
# THAT MEANS I SHOULD ALSO CATCH