pax_global_header00006660000000000000000000000064122040703150014504gustar00rootroot0000000000000052 comment=ccdeab23b68aeb195e46b5ae8aa031c59d21dc42 zbackup-1.2/000077500000000000000000000000001220407031500130055ustar00rootroot00000000000000zbackup-1.2/CMakeLists.txt000066400000000000000000000026641220407031500155550ustar00rootroot00000000000000
# Copyright (c) 2012-2013 Konstantin Isakov
# Part of ZBackup. Licensed under GNU GPLv2 or later
#
# Build script for the `zbackup` executable: locates zlib, OpenSSL, protobuf
# (library and compiler), threads and liblzma, generates the C++ sources for
# zbackup.proto, and builds/installs a single binary.

cmake_minimum_required( VERSION 2.8.2 )
project( zbackup )

# On CMake older than 2.8.9, use the FindLibLZMA module bundled in ./cmake.
# Append rather than overwrite so a module path supplied by the user is kept.
if( CMAKE_VERSION VERSION_LESS "2.8.9" )
  list( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" )
endif()

# Default to a Release build, but do not override an explicit
# -DCMAKE_BUILD_TYPE=... and do not touch multi-config generators, which
# select the configuration at build time.
if( NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES )
  set( CMAKE_BUILD_TYPE Release )
endif()

find_package( ZLIB REQUIRED )
include_directories( ${ZLIB_INCLUDE_DIRS} )

find_package( OpenSSL REQUIRED )
include_directories( ${OPENSSL_INCLUDE_DIR} )

find_package( Protobuf REQUIRED )
include_directories( ${PROTOBUF_INCLUDE_DIRS} )
# The generated *.pb.h headers land in the binary directory.
include_directories( ${CMAKE_CURRENT_BINARY_DIR} )

# Fail early, with a helpful hint, when the protobuf compiler is missing.
# find_program() leaves the variable at "<VAR>-NOTFOUND" on failure, which
# if( NOT ... ) treats as false; this also copes with an empty value.
find_program( PROTOBUF_PROTOC_CHECK NAMES protoc DOC "Protobuf compiler binary" )
if( NOT PROTOBUF_PROTOC_CHECK )
  message( FATAL_ERROR "Could not find protobuf compiler. Make sure protobuf-compiler package is installed." )
endif()

protobuf_generate_cpp( protoSrcs protoHdrs zbackup.proto )

find_package( Threads REQUIRED )

find_package( LibLZMA REQUIRED )
include_directories( ${LIBLZMA_INCLUDE_DIRS} )

# NOTE: sources are globbed, so CMake must be re-run after adding a new .cc
# file for it to be picked up.
file( GLOB sourceFiles "${CMAKE_CURRENT_SOURCE_DIR}/*.cc" )

add_executable( zbackup ${sourceFiles} ${protoSrcs} ${protoHdrs} )

target_link_libraries( zbackup
  ${PROTOBUF_LIBRARIES}
  ${OPENSSL_LIBRARIES}
  ${CMAKE_THREAD_LIBS_INIT}
  ${ZLIB_LIBRARIES}
  ${LIBLZMA_LIBRARIES}
)

install( TARGETS zbackup DESTINATION bin )
zbackup-1.2/LICENSE000066400000000000000000000431641220407031500140220ustar00rootroot00000000000000GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. 
The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. 
b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. ZBackup, a versatile deduplicating backup tool Copyright (C) 2013 zbackup This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. {signature of Ty Coon}, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. zbackup-1.2/README.md000066400000000000000000000437331220407031500142760ustar00rootroot00000000000000# Introduction **zbackup** is a globally-deduplicating backup tool, based on the ideas found in [rsync](http://rsync.samba.org/). Feed a large `.tar` into it, and it will store duplicate regions of it only once, then compress and optionally encrypt the result. Feed another `.tar` file, and it will also re-use any data found in any previous backups. This way only new changes are stored, and as long as the files are not very different, the amount of storage required is very low. 
Any of the backup files stored previously can be read back in full at any time. The program is format-agnostic, so you can feed virtually any files to it (any types of archives, proprietary formats, even raw disk images -- but see [Caveats](#caveats)). This is achieved by sliding a window with a rolling hash over the input at a byte granularity and checking whether the block in focus was ever met already. If a rolling hash matches, an additional full cryptographic hash is calculated to ensure the block is indeed the same. The deduplication happens then. # Features The program has the following features: * Parallel LZMA compression of the stored data * Built-in AES encryption of the stored data * Possibility to delete old backup data in the future * Use of a 64-bit rolling hash, keeping the amount of soft collisions to zero * Repository consists of immutable files. No existing files are ever modified * Written in C++ only with only modest library dependencies * Safe to use in production (see [below](#safety)) # Build dependencies * `cmake` >= 2.8.2 (though it should not be too hard to compile the sources by hand if needed) * `libssl-dev` for all encryption, hashing and random numbers * `libprotobuf-dev` and `protobuf-compiler` for data serialization * `liblzma-dev` for compression * `zlib1g-dev` for adler32 calculation # Quickstart To build: ```bash cd zbackup cmake . make sudo make install # or just run as ./zbackup ``` To use: ```bash zbackup init --non-encrypted /my/backup/repo tar c /my/precious/data | zbackup backup /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` zbackup restore /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` > /my/precious/backup-restored.tar ``` If you have a lot of RAM to spare, you can use it to speed-up the restore process -- to use 512 MB more, pass `--cache-size 512mb` when restoring. 
If encryption is wanted, create a file with your password: ``` bash # more secure to use an editor echo mypassword > ~/.my_backup_password chmod 600 ~/.my_backup_password ``` Then init the repo the following way: ```bash zbackup init --password-file ~/.my_backup_password /my/backup/repo ``` And always pass the same argument afterwards: ```bash tar c /my/precious/data | zbackup --password-file ~/.my_backup_password backup /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` zbackup --password-file ~/.my_backup_password restore /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` > /my/precious/backup-restored.tar ``` If you have a 32-bit system and a lot of cores, consider lowering the number of compression threads by passing `--threads 4` or `--threads 2` if the program runs out of address space when backing up (see why [below](#caveats), item 2). There should be no problem on a 64-bit system. # Caveats * While you can pipe any data into the program, the data should be uncompressed and unencrypted -- otherwise no deduplication could be performed on it. `zbackup` would compress and encrypt the data itself, so there's no need to do that yourself. So just run `tar c` and pipe it into `zbackup` directly. If backing up disk images employing encryption, pipe the unencrypted version (the one you normally mount). If you create `.zip` or `.rar` files, use no compression (`-0` or `-m0`) and no encryption. * Parallel LZMA compression uses a lot of RAM (several hundreds of megabytes, depending on the number of threads used), and ten times more virtual address space. The latter is only relevant on 32-bit architectures where it's limited to 2 or 3 GB. If you hit the ceiling, lower the number of threads with `--threads`. * Since the data is deduplicated, there's naturally no redundancy in it. A loss of a single file can lead to a loss of virtually all data. Make sure you store it on a redundant storage (RAID1, a cloud provider etc). 
* The encryption key, if used, is stored in the `info` file in the root of the repo. It is encrypted with your password. Technically thus you can change your password without re-encrypting any data, and as long as no one possesses the old `info` file and knows your old password, you would be safe (even though the actual option to change password is not implemented yet -- someone who needs this is welcome to create a pull request -- the possibility is all there). Also note that it is crucial you don't lose your `info` file, as otherwise the whole backup would be lost. # Limitations * Right now the only modes supported are reading from standard input and writing to standard output. FUSE mounts and NBD servers may be added later if someone contributes the code. * The program keeps all known blocks in an in-RAM hash table, which may create scalability problems for very large repos (see [below](#scalability)). * The only encryption mode currently implemented is `AES-128` in `CBC` mode with `PKCS#7` padding. If you believe that this is not secure enough, patches are welcome. Before you jump to conclusions however, read [this article](http://www.schneier.com/blog/archives/2009/07/another_new_aes.html). * The only compression mode supported is LZMA, which suits backups very nicely. * It's only possible to fully restore the backup in order to get to a required file, without any option to quickly pick it out. `tar` would not allow to do it anyway, but e.g. for `zip` files it could have been possible. This is possible to implement though, e.g. by exposing the data over a FUSE filesystem. * There's no option to delete old backup data yet. The possibility is all there, though. Someone needs to implement it (see [below](#improvements)). * There's no option to specify block and bundle sizes other than the default ones (currently `64k` and `2MB` respectively), though it's trivial to add command-line switches for those. 
Most of those limitations can be lifted by implementing the respective features. # Safety Is it safe to use `zbackup` for production data? Being free software, the program comes with no warranty of any kind. That said, it's perfectly safe for production, and here's why. When performing a backup, the program never modifies or deletes any existing files -- only new ones are created. It specifically checks for that, and the code paths involved are short and easy to inspect. Furthermore, each backup is protected by its `SHA256` sum, which is calculated before piping the data into the deduplication logic. The code path doing that is also short and easy to inspect. When a backup is being restored, its `SHA256` is calculated again and compared against the stored one. The program would fail on a mismatch. Therefore, to ensure safety it is enough to restore each backup to `/dev/null` immediately after creating it. If it restores fine, it will restore fine ever after. To add some statistics, the author of the program has been using an older version of `zbackup` internally for over a year. The `SHA256` check never ever failed. Again, even if it does, you would know immediately, so no work would be lost. Therefore you are welcome to try the program in production, and if you like it, stick with it. # Usage notes The repository has the following directory structure: ``` /repo backups/ bundles/ 00/ 01/ 02/ ... index/ info ``` * The `backups` directory contains your backups. Those are very small files which are needed for restoration. They are encrypted if encryption is enabled. The names can be arbitrary. It is possible to arrange files in subdirectories, too. Free renaming is also allowed. * The `bundles` directory contains the bulk of data. Each bundle internally contains multiple small chunks, compressed together and encrypted. Together all those chunks account for all deduplicated data stored.
* The `index` directory contains the full index of all chunks in the repository, together with their bundle names. A separate index file is created for each backup session. Technically those files are redundant, all information is contained in the bundles themselves. However, having a separate `index` is nice for two reasons: 1) it's faster to read as it incurs fewer seeks, and 2) it allows making backups while storing bundles elsewhere. Bundles are only needed when restoring -- otherwise it's sufficient to only have `index`. One could then move all newly created bundles into another machine after each backup. * `info` is a very important file which contains all global repository metadata, such as chunk and bundle sizes, and an encryption key encrypted with the user password. It is paramount not to lose it, so backing it up separately somewhere might be a good idea. On the other hand, if you absolutely don't trust your remote storage provider, you might consider not storing it with the rest of the data. It would then be impossible to decrypt it at all, even if your password gets known later. The program does not have any facilities for sending your backup over the network. You can `rsync` the repo to another computer or use any kind of cloud storage capable of storing files. Since `zbackup` never modifies any existing files, the latter is especially easy -- just tell the upload tool you use not to upload any files which already exist on the remote side (e.g. with `gsutil` it's `gsutil cp -R -n /my/backup gs:/mybackup/`). To aid with creating backups, there's a utility called `tartool` included with `zbackup`. The idea is the following: one sprinkles empty files called `.backup` and `.no-backup` across the entire filesystem. Directories where `.backup` files are placed are marked for backing up. Similarly, directories with `.no-backup` files are marked not to be backed up.
Additionally, it is possible to place `.backup-XYZ` in the same directory where `XYZ` is to mark `XYZ` for backing up, or place `.no-backup-XYZ` to mark it not to be backed up. Then `tartool` can be run with three arguments -- the root directory to start from (can be `/`), the output `includes` file, and the output `excludes` file. The tool traverses over the given directory noting the `.backup*` and `.no-backup*` files and creating include and exclude lists for the `tar` utility. The `tar` utility could then be run as `tar c --files-from includes --exclude-from excludes` to store all chosen data. # Scalability This section tries to address the question of the maximum amount of data which can be held in a backup repository. What is meant here is the deduplicated data. The number of bytes in all source files ever fed into the repository doesn't matter, but the total size of the resulting repository does. Internally all input data is split into small blocks called chunks (up to `64k` each by default). Chunks are collected into bundles (up to `2MB` each by default), and those bundles are then compressed and encrypted. There are then two problems with the total number of chunks in the repository: * Hashes of all existing chunks are needed to be kept in RAM while the backup is ongoing. Since the sliding window performs checking with a single-byte granularity, lookups would otherwise be too slow. The amount of data needed to be stored is technically only 24 bytes for each chunk, where the size of the chunk is up to `64k`. In an example real-life `18GB` repo, only `18MB` are taken by its hash index. Multiply this roughly by two to have an estimate of RAM needed to store this index as an in-RAM hash table. However, as this size is proportional to the total size of the repo, for `2TB` repo you could already require `2GB` of RAM.
Most repos are much smaller though, and as long as the deduplication works properly, in many cases you can store terabytes of highly-redundant backup files in a `20GB` repo easily. * We use a 64-bit rolling hash, which allows to have an `O(1)` lookup cost at each byte we process. Due to [birthday paradox](https://en.wikipedia.org/wiki/Birthday_paradox), we would start having collisions when we approach `2^32` hashes. If each chunk we have is `32k` on average, we would get there when our repo grows to `128TB`. We would still be able to continue, but as the number of collisions would grow, we would have to resort to calculating the full hash of a block at each byte more and more often, which would result in a considerable slowdown. All in all, as long as the amount of RAM permits, one can go up to several terabytes in deduplicated data, and start having some slowdown after having hundreds of terabytes, RAM-permitting. # Design choices * We use a 64-bit modified Rabin-Karp rolling hash (see `rolling_hash.hh` for details), while most other programs use a 32-bit one. As noted previously, one problem with the hash size is its birthday bound, which with the 32-bit hash is met after having only `2^16` hashes. The choice of a 64-bit hash allows us to scale much better while having virtually the same calculation cost on a typical 64-bit machine. * `rsync` uses `MD5` as its strong hash. While `MD5` is known to be fast, it is also known to be broken, allowing a malicious user to craft colliding inputs. `zbackup` uses `SHA1` instead. The cost of `SHA1` calculations on modern machines is actually less than that of `MD5` (run `openssl speed md5 sha1` on yours), so it's a win-win situation. We only keep the first 128 bits of the `SHA1` output, and therefore together with the rolling hash we have a 192-bit hash for each chunk. It's a multiple of 8 bytes which is a nice property on 64-bit machines, and it is long enough not to worry about possible collisions.
* `AES-128` in `CBC` mode with `PKCS#7` padding is used for encryption. This seems to be a reasonably safe classic solution. Each encrypted file has a random IV as its first 16 bytes. * We use Google's [protocol buffers](https://developers.google.com/protocol-buffers/) to represent data structures in binary form. They are very efficient and relatively simple to use. # Improvements There's a lot to be improved in the program. It was released with the minimum amount of functionality to be useful. It is also stable. This should hopefully stimulate people to join the development and add all those other fancy features. Here's a list of ideas: * Additional options, such as configurable chunk and bundle sizes etc. * A command to change password. * A command to perform garbage collection. The program should skim through all backups and note which chunks are used by all of them. Then it should skim through all bundles and see which chunks among the ones stored were never used by the backups. If a bundle has more than *X%* of unused chunks, the remaining chunks should be transferred into brand new bundles. The old bundles should be deleted then. Once the process finishes, a new single index file with all existing chunk ids should be written, replacing all previous index files. With this command, it would become possible to remove old backups. * A command to fsck the repo by doing something close to what garbage collection does, but also checking all hashes and so on. * Parallel decompression. Right now decompression is single-threaded, but it is possible to look ahead in the stream and perform prefetching. * Support for mounting the repo over FUSE. Random access to data would then be possible. * Support for exposing a backed up file over a userspace NBD server. It would then be possible to mount raw disk images without extracting them. * Support for other encryption types (preferably for everything `openssl` supports with its `evp`). * Support for other compression methods.
* You name it! # Communication * The program's website is at . * Development happens at . * Discussion forum is at . Please ask for help there! The author is reachable over email at . Please be constructive and don't ask for help using the program, though. In most cases it's best to stick to the forum, unless you have something to discuss with the author in private. # Similar projects `zbackup` is certainly not the first project to embrace the idea of using a rolling hash for deduplication. Here's a list of other projects the author found on the web: * [bup](https://github.com/bup/bup), based on storing data in `git` packs. No possibility of removing old data. This program was the initial inspiration for `zbackup`. * [ddar](http://www.synctus.com/ddar/), seems to be a little bit outdated. Contains a nice list of alternatives with comparisons. * [rdiff-backup](http://www.nongnu.org/rdiff-backup/), based on the original `rsync` algorithm. Does not do global deduplication, only working over the files with the same file name. * [duplicity](http://duplicity.nongnu.org/), which looks similar to `rdiff-backup` with regard to mode of operation. * Some filesystems (most notably [ZFS](http://en.wikipedia.org/wiki/ZFS) and [Btrfs](http://en.wikipedia.org/wiki/Btrfs)) provide deduplication features. They do so only at block level though, without a sliding window, so they cannot accommodate arbitrary byte insertion/deletion in the middle of data. # Credits Copyright (c) 2013-2013 Konstantin Isakov (). Licensed under GNU GPLv2 or later. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. zbackup-1.2/adler32.hh000066400000000000000000000014531220407031500145650ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef ADLER32_HH_INCLUDED__ #define ADLER32_HH_INCLUDED__ #include #include #include /// A simple wrapper to calculate adler32 class Adler32 { public: typedef uint32_t Value; Adler32(): value( ( Value ) adler32( 0, 0, 0 ) ) {} void add( void const * data, size_t size ) { // When size is 0, we assume a no-op was requested and 'data' should be // ignored. However, adler32() has a special semantic for NULL 'data'. // Therefore we check the size before calling it if ( size ) value = ( Value ) adler32( value, ( Bytef const * ) data, size ); } Value result() const { return value; } private: Value value; }; #endif zbackup-1.2/appendallocator.cc000066400000000000000000000045351220407031500164730ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include #include "appendallocator.hh" AppendAllocator::AppendAllocator( unsigned blockSize_, unsigned granularity ): alignMask( granularity - 1 ), // We may decide to enlarge the block to make sure it is a multiple of // granularity. An improperly sized block would just waste the leftover // bytes blockSize( ( blockSize_ + alignMask ) & ~alignMask ), leftInBlock( -1 ) { } char * AppendAllocator::allocateBytes( unsigned size ) { // For zero-sized allocations, we always return a non-zero pointer. To do // that, we need to make sure we have it if ( !size && !blocks.empty() ) return nextAvailable; if ( leftInBlock < (int) size ) { unsigned toAllocate = ( size <= blockSize ? 
blockSize : size ); // Need a new block char * p = (char *) malloc( toAllocate ); if ( !p ) throw std::bad_alloc(); blocks.push_back( Record( p, nextAvailable, leftInBlock ) ); leftInBlock = (int) toAllocate; nextAvailable = p; } // We may need to allocate more than was asked to preserve granularity int toTake = (int) ( ( size + alignMask ) & ~alignMask ); char * result = nextAvailable; nextAvailable += toTake; leftInBlock -= toTake; // leftInBlock can become negative here, as toTake can // actually be larger than the space left due to an added alignment return result; } void AppendAllocator::returnBytes( unsigned size ) { if ( !size ) return; // If we are pointing to the start of the block, we need to free it and go // back to the previous one if ( nextAvailable == blocks.back().data ) { if ( blocks.size() == 1 ) throw std::bad_alloc(); free( blocks.back().data ); leftInBlock = blocks.back().prevLeftInBlock; nextAvailable = blocks.back().prevNextAvailable; blocks.pop_back(); } unsigned toTake = ( size + alignMask ) & ~alignMask; // There must be enough used bytes in the block if ( nextAvailable - blocks.back().data < (int) toTake ) throw std::bad_alloc(); nextAvailable -= toTake; leftInBlock += toTake; } void AppendAllocator::clear() { for ( unsigned x = blocks.size(); x--; ) free( blocks[ x ].data ); blocks.clear(); leftInBlock = -1; } AppendAllocator::~AppendAllocator() { clear(); } zbackup-1.2/appendallocator.hh000066400000000000000000000040141220407031500164750ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef APPENDALLOCATOR_HH_INCLUDED__ #define APPENDALLOCATOR_HH_INCLUDED__ #include #include #include #include /// A simple "add-only" memory allocation mechanism. 
/// Memory is handed out sequentially from large blocks; individual
/// allocations can only be undone in reverse (stack) order, or all at once
class AppendAllocator
{
  /// granularity - 1. Used as a bit mask to round sizes up to the granularity
  unsigned alignMask;
  /// Size of a regular block, rounded up to a multiple of the granularity
  unsigned blockSize;

  /// Describes one underlying block, along with the allocator state as it was
  /// right before that block was created, so that the state can be restored
  /// once the block is given back
  struct Record
  {
    char * data;
    char * prevNextAvailable;
    int prevLeftInBlock;

    Record( char * data_, char * prevNextAvailable_, int prevLeftInBlock_ ):
      data( data_ ), prevNextAvailable( prevNextAvailable_ ),
      prevLeftInBlock( prevLeftInBlock_ ) {}
  };

  /// All blocks allocated so far, the most recent one being the last
  std::vector< Record > blocks;
  /// Where the next allocation would be placed within the last block
  char * nextAvailable;
  int leftInBlock; // Can become < 0 due to added alignment

public:

  /// blockSize is the amount of bytes allocated for each of the underlying
  /// storage blocks. granularity makes sure you allocate objects with
  /// the proper alignment. It must be a power of 2
  AppendAllocator( unsigned blockSize, unsigned granularity );
  ~AppendAllocator();

  /// Removes all data from the append allocator.
  void clear();

  /// Allocates a size-sized memory block. It can be given back with a
  /// matching returnBytes() call; otherwise it stays allocated until clear()
  /// is called or the AppendAllocator is destroyed. Can throw bad_alloc in an
  /// out-of-memory situation
  char * allocateBytes( unsigned size );

  /// Returns the allocated bytes back. The size must match the size passed
  /// to allocateBytes() on the last invocation. Calls to allocateBytes()/
  /// returnBytes() must follow the stack order - returnBytes() should undo
  /// the previous allocateBytes()
  void returnBytes( unsigned size );

  /// Allocates memory to hold 'count' objects of T. In essence, it just does
  /// multiplication and type casting. No constructors are run
  template< typename T >
  T * allocateObjects( unsigned count )
  { return (T *) allocateBytes( count * sizeof( T ) ); }

  /// Returns the allocated objects back
  template< typename T >
  void returnObjects( unsigned count )
  { returnBytes( count * sizeof( T ) ); }
};
Licensed under GNU GPLv2 or later #include #include #include "backup_creator.hh" #include "check.hh" #include "debug.hh" #include "message.hh" #include "page_size.hh" #include "static_assert.hh" namespace { unsigned const MinChunkSize = 256; } BackupCreator::BackupCreator( StorageInfo const & info, ChunkIndex & chunkIndex, ChunkStorage::Writer & chunkStorageWriter ): chunkMaxSize( info.chunk_max_size() ), chunkIndex( chunkIndex ), chunkStorageWriter( chunkStorageWriter ), ringBufferFill( 0 ), chunkToSaveFill( 0 ), backupDataStream( new google::protobuf::io::StringOutputStream( &backupData ) ), chunkIdGenerated( false ) { // In our ring buffer we have enough space to store one chunk plus an extra // page for buffering the input ringBuffer.resize( chunkMaxSize + getPageSize() ); begin = ringBuffer.data(); end = &ringBuffer.back() + 1; head = begin; tail = head; chunkToSave.resize( chunkMaxSize ); } void * BackupCreator::getInputBuffer() { return head; } size_t BackupCreator::getInputBufferSize() { if ( tail > head ) return tail - head; else if ( tail == head && ringBufferFill ) return 0; else return end - head; } void BackupCreator::handleMoreData( unsigned added ) { // Note: head is never supposed to wrap around in the middle of the operation, // as getInputBufferSize() never returns a value which could result in a // wrap-around while( added ) { // If we don't have a full chunk, we need to consume data until we have // one if ( ringBufferFill < chunkMaxSize ) { unsigned left = chunkMaxSize - ringBufferFill; bool canFullyFill = added >= left; unsigned toFill = canFullyFill ? 
left : added; added -= toFill; ringBufferFill += toFill; while ( toFill-- ) rollingHash.rollIn( *head++ ); if ( head == end ) head = begin; // If we've managed to fill in the complete chunk, attempt matching it if ( canFullyFill ) addChunkIfMatched(); } else { // At this point we have a full chunk in the ring buffer, so we can rotate // over a byte chunkToSave[ chunkToSaveFill++ ] = *tail; if ( chunkToSaveFill == chunkMaxSize ) // Got the full chunk - save it saveChunkToSave(); rollingHash.rotate( *head++, *tail++ ); if ( head == end ) head = begin; if ( tail == end ) tail = begin; addChunkIfMatched(); --added; // A byte was consumed } } } void BackupCreator::saveChunkToSave() { CHECK( chunkToSaveFill > 0, "chunk to save is empty" ); if ( chunkToSaveFill < 128 ) // TODO: make this value configurable { // The amount of data is too small - emit without creating a new chunk BackupInstruction instr; instr.set_bytes_to_emit( chunkToSave.data(), chunkToSaveFill ); outputInstruction( instr ); } else { // Output as a chunk ChunkId id; id.rollingHash = RollingHash::digest( chunkToSave.data(), chunkToSaveFill ); unsigned char sha1Value[ SHA_DIGEST_LENGTH ]; SHA1( (unsigned char const *) chunkToSave.data(), chunkToSaveFill, sha1Value ); STATIC_ASSERT( sizeof( id.cryptoHash ) <= sizeof( sha1Value ) ); memcpy( id.cryptoHash, sha1Value, sizeof( id.cryptoHash ) ); // Save it to the store if it's not there already chunkStorageWriter.add( id, chunkToSave.data(), chunkToSaveFill ); BackupInstruction instr; instr.set_chunk_to_emit( id.toBlob() ); outputInstruction( instr ); } chunkToSaveFill = 0; } void BackupCreator::finish() { dPrintf( "At finish: %u, %u\n", chunkToSaveFill, ringBufferFill ); // At this point we may have some bytes in chunkToSave, and some in the ring // buffer. 
We need to save both if ( chunkToSaveFill + ringBufferFill > chunkMaxSize ) { // We have more than a full chunk in chunkToSave and ringBuffer together, so // save the first part as a full chunk first // Move data from ring buffer to have full chunk in chunkToSave. moveFromRingBufferToChunkToSave( chunkMaxSize - chunkToSaveFill ); saveChunkToSave(); } // Concatenate the rest of data and save it too CHECK( chunkToSaveFill + ringBufferFill <= chunkMaxSize, "had more than two " "full chunks at backup finish" ); moveFromRingBufferToChunkToSave( ringBufferFill ); if ( chunkToSaveFill ) saveChunkToSave(); } void BackupCreator::moveFromRingBufferToChunkToSave( unsigned toMove ) { // If tail is before head, all data in the ring buffer is in one contiguous // piece. If not, it's in two pieces if ( tail < head ) { memcpy( chunkToSave.data() + chunkToSaveFill, tail, toMove ); tail += toMove; } else { unsigned toEnd = end - tail; unsigned firstPart = toEnd < toMove ? toEnd : toMove; memcpy( chunkToSave.data() + chunkToSaveFill, tail, firstPart ); tail += firstPart; if ( toMove > firstPart ) { unsigned secondPart = toMove - firstPart; memcpy( chunkToSave.data() + chunkToSaveFill + firstPart, begin, secondPart ); tail = begin + secondPart; } } if ( tail == end ) tail = begin; chunkToSaveFill += toMove; ringBufferFill -= toMove; } ChunkId const & BackupCreator::getChunkId() { if ( !chunkIdGenerated ) { // Calculate SHA1 SHA_CTX ctx; SHA1_Init( &ctx ); if ( tail < head ) { // Tail is before head - all the block is in one contiguous piece SHA1_Update( &ctx, tail, head - tail ); } else { // Tail is after head - the block consists of two pieces SHA1_Update( &ctx, tail, end - tail ); SHA1_Update( &ctx, begin, head - begin ); } unsigned char sha1Value[ SHA_DIGEST_LENGTH ]; SHA1_Final( sha1Value, &ctx ); generatedChunkId.rollingHash = rollingHash.digest(); memcpy( generatedChunkId.cryptoHash, sha1Value, sizeof( generatedChunkId.cryptoHash ) ); chunkIdGenerated = true; } return 
generatedChunkId; } void BackupCreator::addChunkIfMatched() { chunkIdGenerated = false; if ( chunkIndex.findChunk( rollingHash.digest(), *this ) ) { // verbosePrintf( "Reuse of chunk %lu\n", rollingHash.digest() ); // Before emitting the matched chunk, we need to make sure any bytes // which came before it are saved first if ( chunkToSaveFill ) saveChunkToSave(); // Add the record BackupInstruction instr; instr.set_chunk_to_emit( getChunkId().toBlob() ); outputInstruction( instr ); // The block was consumed from the ring buffer - remove the block from it tail = head; ringBufferFill = 0; rollingHash.reset(); } } void BackupCreator::outputInstruction( BackupInstruction const & instr ) { // TODO: once backupData becomes large enough, spawn another BackupCreator and // feed data to it. This way we wouldn't have to store the entire backupData // in RAM Message::serialize( instr, *backupDataStream ); } void BackupCreator::getBackupData( string & str ) { CHECK( backupDataStream.get(), "getBackupData() called twice" ); backupDataStream.reset(); str.swap( backupData ); } zbackup-1.2/backup_creator.hh000066400000000000000000000055401220407031500163160ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
/// Creates a backup by processing input data and matching/writing chunks.
/// The input is passed through a ring buffer over which a rolling hash is
/// maintained. Whenever the block in the ring buffer is found in the chunk
/// index, a reference to the existing chunk is emitted; the bytes which fall
/// out of the ring buffer unmatched are gathered and stored as new chunks
class BackupCreator: ChunkIndex::ChunkInfoInterface, NoCopy
{
  /// Size of the window being matched, and the largest chunk to be stored
  unsigned chunkMaxSize;
  ChunkIndex & chunkIndex;
  ChunkStorage::Writer & chunkStorageWriter;
  /// Holds chunkMaxSize bytes plus an extra page for buffering the input
  vector< char > ringBuffer;
  // Ring buffer vars
  char * begin; // First byte of ringBuffer
  char * end; // One past the last byte of ringBuffer
  char * head; // Where the next input byte goes to
  char * tail; // The oldest byte still held
  unsigned ringBufferFill; // Number of bytes currently held

  /// In this buffer we assemble the next chunk to be eventually stored. We
  /// copy the bytes from the ring buffer. While the copying may be avoided in
  /// some cases, the plan is to move to multi-threaded chunk storage in the
  /// future, where it would be necessary in any case
  vector< char > chunkToSave;
  unsigned chunkToSaveFill; /// Number of bytes accumulated in chunkToSave
  /// When we have data in chunkToSave, this points to the record in backupData
  /// which should store it
  /// NOTE(review): no use of this member is visible in backup_creator.cc -
  /// confirm whether it is still needed
  unsigned recordIndexToSaveDataInto;

  /// Hash of the bytes currently held in the ring buffer
  RollingHash rollingHash;

  /// Serialized backup instructions emitted so far
  string backupData;
  /// Stream writing into backupData. Reset once getBackupData() is called
  sptr< google::protobuf::io::StringOutputStream > backupDataStream;

  /// Sees if the current block in the ring buffer exists in the chunk store.
  /// If it does, the reference is emitted and the ring buffer is cleared
  void addChunkIfMatched();

  /// Outputs data contained in chunkToSave as a new chunk
  void saveChunkToSave();

  /// Move the given amount of bytes from the ring buffer to the chunk to save.
  /// Ring buffer must have at least that many bytes
  void moveFromRingBufferToChunkToSave( unsigned bytes );

  /// Outputs the given instruction to the backup stream
  void outputInstruction( BackupInstruction const & );

  /// True when generatedChunkId is valid for the current ring buffer contents
  bool chunkIdGenerated;
  ChunkId generatedChunkId;
  /// Returns the id of the block currently held in the ring buffer. The SHA1
  /// part is only calculated when first asked for
  virtual ChunkId const & getChunkId();

public:
  BackupCreator( StorageInfo const &, ChunkIndex &, ChunkStorage::Writer & );

  /// The data is fed the following way: the user fills getInputBuffer() with
  /// up to getInputBufferSize() bytes, then calls handleMoreData() with the
  /// number of bytes written
  void * getInputBuffer();
  size_t getInputBufferSize();

  void handleMoreData( unsigned );

  /// Flushes any remaining data and finishes the process. No additional data
  /// may be added after this call is made
  void finish();

  /// Returns the result of the backup creation. Can only be called once the
  /// finish() was called and the backup is complete. The data is moved out,
  /// so this may be called a single time only
  void getBackupData( string & );
};
Licensed under GNU GPLv2 or later #include "backup_file.hh" #include "encrypted_file.hh" #include "encryption.hh" #include "message.hh" namespace BackupFile { enum { FileFormatVersion = 1 }; void save( string const & fileName, EncryptionKey const & encryptionKey, BackupInfo const & backupInfo ) { EncryptedFile::OutputStream os( fileName.c_str(), encryptionKey, Encryption::ZeroIv ); os.writeRandomIv(); FileHeader header; header.set_version( FileFormatVersion ); Message::serialize( header, os ); Message::serialize( backupInfo, os ); os.writeAdler32(); } void load( string const & fileName, EncryptionKey const & encryptionKey, BackupInfo & backupInfo ) { EncryptedFile::InputStream is( fileName.c_str(), encryptionKey, Encryption::ZeroIv ); is.consumeRandomIv(); FileHeader header; Message::parse( header, is ); if ( header.version() != FileFormatVersion ) throw exUnsupportedVersion(); Message::parse( backupInfo, is ); is.checkAdler32(); } } zbackup-1.2/backup_file.hh000066400000000000000000000013701220407031500155730ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef BACKUP_FILE_HH_INCLUDED__ #define BACKUP_FILE_HH_INCLUDED__ #include #include #include "encryption_key.hh" #include "ex.hh" #include "zbackup.pb.h" namespace BackupFile { using std::string; DEF_EX( Ex, "Backup file exception", std::exception ) DEF_EX( exUnsupportedVersion, "Unsupported version of the backup file format", Ex ) /// Saves the given BackupInfo data into the given file void save( string const & fileName, EncryptionKey const &, BackupInfo const & ); /// Loads the given BackupInfo data from the given file void load( string const & fileName, EncryptionKey const &, BackupInfo & ); } #endif zbackup-1.2/backup_restorer.cc000066400000000000000000000034571220407031500165170ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include #include #include #include "backup_restorer.hh" #include "chunk_id.hh" #include "message.hh" #include "zbackup.pb.h" namespace BackupRestorer { using std::vector; using google::protobuf::io::CodedInputStream; void restore( ChunkStorage::Reader & chunkStorageReader, std::string const & backupData, DataSink & output ) { google::protobuf::io::ArrayInputStream is( backupData.data(), backupData.size() ); CodedInputStream cis( &is ); CodedInputStream::Limit limit = cis.PushLimit( backupData.size() ); // The following line prevents it from barfing on large backupData. // TODO: this disables size checks for each separate message. Figure a better // way to do this while keeping them enabled. It seems we need to create an // instance of CodedInputStream for each message, but it might be expensive cis.SetTotalBytesLimit( backupData.size(), -1 ); // Used when emitting chunks string chunk; BackupInstruction instr; while ( cis.BytesUntilLimit() > 0 ) { Message::parse( instr, cis ); if ( instr.has_chunk_to_emit() ) { // Need to emit a chunk, reading it from the store size_t chunkSize; chunkStorageReader.get( ChunkId( instr.chunk_to_emit() ), chunk, chunkSize ); output.saveData( chunk.data(), chunkSize ); } if ( instr.has_bytes_to_emit() ) { // Need to emit the bytes directly string const & bytes = instr.bytes_to_emit(); output.saveData( bytes.data(), bytes.size() ); } } cis.PopLimit( limit ); } } zbackup-1.2/backup_restorer.hh000066400000000000000000000014241220407031500165210ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef BACKUP_RESTORER_HH_INCLUDED__ #define BACKUP_RESTORER_HH_INCLUDED__ #include #include #include #include "chunk_storage.hh" #include "ex.hh" /// Generic interface to stream data out class DataSink { public: virtual void saveData( void const * data, size_t size )=0; virtual ~DataSink() {} }; /// Restores the backup namespace BackupRestorer { DEF_EX( Ex, "Backup restorer exception", std::exception ) DEF_EX( exTooManyBytesToEmit, "A backup record asks to emit too many bytes", Ex ) /// Restores the given backup void restore( ChunkStorage::Reader &, std::string const & backupData, DataSink & ); } #endif zbackup-1.2/bundle.cc000066400000000000000000000115461220407031500145740ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include #include "bundle.hh" #include "check.hh" #include "dir.hh" #include "encrypted_file.hh" #include "encryption.hh" #include "hex.hh" #include "message.hh" namespace Bundle { enum { FileFormatVersion = 1 }; void Creator::addChunk( string const & id, void const * data, size_t size ) { BundleInfo_ChunkRecord * record = info.add_chunk_record(); record->set_id( id ); record->set_size( size ); payload.append( ( char const * ) data, size ); } void Creator::write( std::string const & fileName, EncryptionKey const & key ) { EncryptedFile::OutputStream os( fileName.c_str(), key, Encryption::ZeroIv ); os.writeRandomIv(); FileHeader header; header.set_version( FileFormatVersion ); Message::serialize( header, os ); Message::serialize( info, os ); os.writeAdler32(); // Compress uint32_t preset = 6; // TODO: make this customizable, although 6 seems to be // the best option lzma_stream strm = LZMA_STREAM_INIT; lzma_ret ret; ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 ); CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret ); strm.next_in = ( uint8_t const * ) payload.data(); strm.avail_in = payload.size(); for 
/// Loads the bundle from the given file: reads and validates the header and
/// the bundle info, decompresses the whole payload into memory, then builds
/// a map from each chunk id to the location of its body within the payload.
/// Throws exUnsupportedVersion, exBundleReadFailed, exTooMuchData or
/// exDuplicateChunks when the respective problem is detected
Reader::Reader( string const & fileName, EncryptionKey const & key )
{
  EncryptedFile::InputStream is( fileName.c_str(), key, Encryption::ZeroIv );

  is.consumeRandomIv();

  FileHeader header;
  Message::parse( header, is );

  if ( header.version() != FileFormatVersion )
    throw exUnsupportedVersion();

  BundleInfo info;
  Message::parse( info, is );
  // The first checksum covers the header and the bundle info
  is.checkAdler32();

  // The size of the unpacked payload is the sum of the sizes of all chunks
  // listed in the bundle info
  size_t payloadSize = 0;
  for ( int x = info.chunk_record_size(); x--; )
    payloadSize += info.chunk_record( x ).size();

  payload.resize( payloadSize );

  lzma_stream strm = LZMA_STREAM_INIT;

  lzma_ret ret;

  ret = lzma_stream_decoder( &strm, UINT64_MAX, 0 );
  CHECK( ret == LZMA_OK,"lzma_stream_decoder error: %d", (int) ret );

  // The decoder writes directly into the payload buffer
  strm.next_out = ( uint8_t * ) &payload[ 0 ];
  strm.avail_out = payload.size();

  for ( ; ; )
  {
    {
      // Get the next piece of compressed data from the file
      void const * data;
      int size;
      if ( !is.Next( &data, &size ) )
      {
        lzma_end( &strm );
        throw exBundleReadFailed();
      }
      if ( !size )
        continue;
      strm.next_in = ( uint8_t const * ) data;
      strm.avail_in = size;
    }

    ret = lzma_code( &strm, LZMA_RUN );

    if ( ret == LZMA_STREAM_END )
    {
      // Give back whatever input the decoder did not consume, so that the
      // checksum which follows can be read from the right position
      if ( strm.avail_in )
        is.BackUp( strm.avail_in );
      break;
    }

    CHECK( ret == LZMA_OK, "lzma_code error: %d", (int) ret );

    if ( !strm.avail_out && strm.avail_in )
    {
      // Apparently we have more data than we were expecting
      lzma_end( &strm );
      throw exTooMuchData();
    }
  }

  // NOTE(review): strm.avail_out is not checked once the stream has ended, so
  // a stream which unpacks to fewer bytes than payloadSize is not reported
  lzma_end( &strm );

  // The second checksum covers the compressed payload
  is.checkAdler32();

  // Populate the map
  // Chunk bodies are laid out in the payload in the order of their records
  char const * next = payload.data();
  for ( int x = 0, count = info.chunk_record_size(); x < count; ++x )
  {
    BundleInfo_ChunkRecord const & record = info.chunk_record( x );
    pair< Chunks::iterator, bool > res =
      chunks.insert( Chunks::value_type( record.id(),
                                         Chunks::mapped_type( next,
                                                              record.size() ) ) );
    if ( !res.second )
      throw exDuplicateChunks(); // Duplicate key encountered
    next += record.size();
  }
}
operator == ( other ); } }; STATIC_ASSERT( sizeof( Id ) == IdSize ); /// Creates a bundle by adding chunks to it until it's full, then compressing /// it and writing out to disk class Creator { BundleInfo info; string payload; public: DEF_EX( Ex, "Bundle creator exception", std::exception ) DEF_EX( exBundleWriteFailed, "Bundle write failed", Ex ) /// Adds a chunk with the given id void addChunk( string const & chunkId, void const * data, size_t size ); /// Returns the number of bytes comprising all chunk bodies so far size_t getPayloadSize() const { return payload.size(); } /// Compresses and writes the bundle to the given file. The operation is /// time-consuming - calling this function from a worker thread could be /// warranted void write( string const & fileName, EncryptionKey const & ); /// Returns the current BundleInfo record - this is used for index files BundleInfo const & getCurrentBundleInfo() const { return info; } }; /// Reads the bundle and allows accessing chunks class Reader: NoCopy { /// Unpacked payload string payload; /// Maps chunk id blob to its contents and size typedef map< string, pair< char const *, size_t > > Chunks; Chunks chunks; public: DEF_EX( Ex, "Bundle reader exception", std::exception ) DEF_EX( exBundleReadFailed, "Bundle read failed", Ex ) DEF_EX( exUnsupportedVersion, "Unsupported version of the index file format", Ex ) DEF_EX( exTooMuchData, "More data than expected in a bundle", Ex ) DEF_EX( exDuplicateChunks, "Chunks with the same id found in a bundle", Ex ) Reader( string const & fileName, EncryptionKey const & ); /// Reads the chunk into chunkData and returns true, or returns false if there /// was no such chunk in the bundle. chunkData may be enlarged but won't /// be shrunk. The size of the actual chunk would be stored in chunkDataSize bool get( string const & chunkId, string & chunkData, size_t & chunkDataSize ); }; /// Generates a full file name for a bundle with the given id. 
// Run-time assertion macro
// Usage: CHECK( value == 16, "Value is not 16: %d", value );
// This will abort() if the value is not 16 with the message stating so.
// The message is a printf-style format string, followed by its arguments, if
// any. On failure the message and the source location of the CHECK are
// written to stderr before aborting.
// Note that the macro relies on two GNU extensions: statement expressions
// "({ ... })" and the "##__VA_ARGS__" comma elision.

// TODO: show the backtrace here, without using __FILE__ __LINE__

#define CHECK( condition, message, ... ) ({if (!(condition)) \
{ \
  fprintf( stderr, "Check failed: " ); \
  fprintf( stderr, message, ##__VA_ARGS__ ); \
  fprintf( stderr, "\nAt %s:%d\n", __FILE__, __LINE__ ); \
  abort(); \
}})

// Unconditional failure. Takes the same message arguments as CHECK does
#define FAIL( ... ) CHECK( false, __VA_ARGS__ )


// Debug-only versions. Only instantiated in debug builds
// When NDEBUG is defined they expand to nothing, so their arguments are not
// evaluated at all - do not put expressions with side effects there
#ifndef NDEBUG
#define DCHECK CHECK
#define DFAIL FAIL
#else
#define DCHECK( ... )
#define DFAIL( ... )
#endif
Licensed under GNU GPLv2 or later #include "chunk_id.hh" #include #include "endian.hh" #include "check.hh" string ChunkId::toBlob() const { string out( BlobSize, 0 ); toBlob( &out[ 0 ] ); return out; } void ChunkId::toBlob( void * outPtr ) const { char * out = ( char * ) outPtr; RollingHash::Digest v = toLittleEndian( rollingHash ); memcpy( out, cryptoHash, sizeof( cryptoHash ) ); memcpy( out + sizeof( cryptoHash ), &v, sizeof( v ) ); } void ChunkId::setFromBlob( void const * data ) { char const * blob = ( char const * ) data; RollingHash::Digest v; memcpy( cryptoHash, blob, sizeof( cryptoHash ) ); memcpy( &v, blob + sizeof( cryptoHash ), sizeof( v ) ); rollingHash = fromLittleEndian( v ); } ChunkId::ChunkId( string const & blob ) { CHECK( blob.size() == BlobSize, "incorrect blob sise: %zu", blob.size() ); setFromBlob( blob.data() ); } zbackup-1.2/chunk_id.hh000066400000000000000000000015351220407031500151160ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef CHUNK_ID_HH_INCLUDED__ #define CHUNK_ID_HH_INCLUDED__ #include #include "rolling_hash.hh" using std::string; /// Chunk is identified by its crypto hash concatenated with its rolling hash struct ChunkId { typedef char CryptoHashPart[ 16 ]; CryptoHashPart cryptoHash; typedef RollingHash::Digest RollingHashPart; RollingHashPart rollingHash; enum { BlobSize = sizeof( CryptoHashPart ) + sizeof( RollingHashPart ) }; string toBlob() const; /// Faster version - should point to a buffer with at least BlobSize bytes void toBlob( void * ) const; /// Set the chunk id data reading from the given blob void setFromBlob( void const * ); ChunkId() {} ChunkId( string const & blob ); }; #endif zbackup-1.2/chunk_index.cc000066400000000000000000000074321220407031500156210ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include #include #include #include #include "chunk_index.hh" #include "debug.hh" #include "dir.hh" #include "index_file.hh" #include "zbackup.pb.h" ChunkIndex::Chain::Chain( ChunkId const & id, Bundle::Id const * bundleId ): next( 0 ), bundleId( bundleId ) { memcpy( cryptoHash, id.cryptoHash, sizeof( cryptoHash ) ); } bool ChunkIndex::Chain::equalsTo( ChunkId const & id ) { return memcmp( cryptoHash, id.cryptoHash, sizeof ( cryptoHash ) ) == 0; } void ChunkIndex::loadIndex() { Dir::Listing lst( indexPath ); Dir::Entry entry; verbosePrintf( "Loading index...\n" ); while( lst.getNext( entry ) ) { verbosePrintf( "Loading index file %s...\n", entry.getFileName().c_str() ); IndexFile::Reader reader( key, Dir::addPath( indexPath, entry.getFileName() ) ); BundleInfo info; Bundle::Id bundleId; while( reader.readNextRecord( info, bundleId ) ) { Bundle::Id * savedId = storage.allocateObjects< Bundle::Id >( 1 ); memcpy( savedId, &bundleId, sizeof( bundleId ) ); lastBundleId = savedId; ChunkId id; for ( int x = info.chunk_record_size(); x--; ) { BundleInfo_ChunkRecord const & record = info.chunk_record( x ); if ( record.id().size() != ChunkId::BlobSize ) throw exIncorrectChunkIdSize(); id.setFromBlob( record.id().data() ); registerNewChunkId( id, savedId ); } } } verbosePrintf( "Index loaded.\n" ); } ChunkIndex::ChunkIndex( EncryptionKey const & key, TmpMgr & tmpMgr, string const & indexPath ): key( key ), tmpMgr( tmpMgr ), indexPath( indexPath ), storage( 65536, 1 ), lastBundleId( NULL ) { loadIndex(); } Bundle::Id const * ChunkIndex::findChunk( ChunkId::RollingHashPart rollingHash, ChunkInfoInterface & chunkInfo ) { HashTable::iterator i = hashTable.find( rollingHash ); ChunkId const * id = 0; if ( i != hashTable.end() ) { if ( !id ) id = &chunkInfo.getChunkId(); // Check the chains for ( Chain * chain = i->second; chain; chain = chain->next ) if ( chain->equalsTo( *id ) ) return chain->bundleId; } return NULL; } namespace { struct 
ChunkInfoImmediate: public ChunkIndex::ChunkInfoInterface { ChunkId const & id; ChunkInfoImmediate( ChunkId const & id ): id( id ) {} virtual ChunkId const & getChunkId() { return id; } }; } Bundle::Id const * ChunkIndex::findChunk( ChunkId const & chunkId ) { ChunkInfoImmediate chunkInfo( chunkId ); return findChunk( chunkId.rollingHash, chunkInfo ); } ChunkIndex::Chain * ChunkIndex::registerNewChunkId( ChunkId const & id, Bundle::Id const * bundleId ) { HashTable::iterator i = hashTable.insert( std::make_pair( id.rollingHash, ( Chain *) 0 ) ).first; Chain ** chain = &i->second; // Check the chains for ( ; *chain; chain = &( ( *chain )->next ) ) if ( ( *chain )->equalsTo( id ) ) { return NULL; // The entry existed already } // Create a new chain *chain = new ( storage.allocateObjects< Chain >( 1 ) ) Chain( id, bundleId ); return *chain; } bool ChunkIndex::addChunk( ChunkId const & id, Bundle::Id const & bundleId ) { if ( Chain * chain = registerNewChunkId( id, NULL ) ) { // Allocate or re-use bundle id if ( !lastBundleId || *lastBundleId != bundleId ) { Bundle::Id * allocatedId = storage.allocateObjects< Bundle::Id >( 1 ); memcpy( allocatedId, &bundleId, Bundle::IdSize ); lastBundleId = allocatedId; } chain->bundleId = lastBundleId; return true; } else return false; } zbackup-1.2/chunk_index.hh000066400000000000000000000062541220407031500156340ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef CHUNK_INDEX_HH_INCLUDED__ #define CHUNK_INDEX_HH_INCLUDED__ // is obsolete, but requires C++11. Make up your // mind, GNU people! 
#undef __DEPRECATED #include #include #include #include #include #include #include "appendallocator.hh" #include "bundle.hh" #include "chunk_id.hh" #include "dir.hh" #include "encryption_key.hh" #include "endian.hh" #include "ex.hh" #include "index_file.hh" #include "nocopy.hh" #include "rolling_hash.hh" #include "tmp_mgr.hh" using std::vector; /// __gnu_cxx::hash is not defined for unsigned long long. As uint64_t is /// typedefed as unsigned long long on all 32-bit architectures and on some /// 64-bit ones, we need to define this. Our keys should have more or less /// uniform bit distribution, so on 32-bit systems returning the lower 32 bits /// should be fine namespace __gnu_cxx { template<> struct hash< unsigned long long > { size_t operator()( unsigned long long v ) const { return v; } }; } /// Maintains an in-memory hash table allowing to check whether we have a /// specific chunk or not, and if we do, get the bundle id it's in class ChunkIndex: NoCopy { struct Chain { ChunkId::CryptoHashPart cryptoHash; Chain * next; Bundle::Id const * bundleId; Chain( ChunkId const &, Bundle::Id const * bundleId ); bool equalsTo( ChunkId const & id ); }; /// This hash map stores all known chunk ids /// TODO: implement a custom hash table for better performance typedef __gnu_cxx::hash_map< RollingHash::Digest, Chain * > HashTable; EncryptionKey const & key; TmpMgr & tmpMgr; string indexPath; AppendAllocator storage; HashTable hashTable; /// Stores the last used bundle id, which can be re-used Bundle::Id const * lastBundleId; public: DEF_EX( Ex, "Chunk index exception", std::exception ) DEF_EX( exIncorrectChunkIdSize, "Incorrect chunk id size encountered", Ex ) ChunkIndex( EncryptionKey const &, TmpMgr &, string const & indexPath ); struct ChunkInfoInterface { /// Returns the full id of the chunk. 
This function is only called if that /// full id is actually needed, as its generation requires the expensive /// calculation of the full hash virtual ChunkId const & getChunkId()=0; virtual ~ChunkInfoInterface() {} }; /// If the given chunk exists, its bundle id is returned, otherwise NULL Bundle::Id const * findChunk( ChunkId::RollingHashPart, ChunkInfoInterface & ); /// If the given chunk exists, its bundle id is returned, otherwise NULL Bundle::Id const * findChunk( ChunkId const & ); /// Adds a new chunk to the index if it did not exist already. Returns true /// if added, false if existed already bool addChunk( ChunkId const &, Bundle::Id const & ); private: void loadIndex(); /// Inserts new chunk id into the in-memory hash table. Returns the created /// Chain if it was inserted, NULL if it existed before Chain * registerNewChunkId( ChunkId const & id, Bundle::Id const * ); }; #endif zbackup-1.2/chunk_storage.cc000066400000000000000000000140711220407031500161530ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "check.hh" #include "chunk_storage.hh" #include "debug.hh" #include "dir.hh" #include "hex.hh" #include "random.hh" namespace ChunkStorage { Writer::Writer( StorageInfo const & storageInfo, EncryptionKey const & encryptionKey, TmpMgr & tmpMgr, ChunkIndex & index, string const & bundlesDir, string const & indexDir, size_t maxCompressorsToRun ): storageInfo( storageInfo ), encryptionKey( encryptionKey ), tmpMgr( tmpMgr ), index( index ), bundlesDir( bundlesDir ), indexDir( indexDir ), hasCurrentBundleId( false ), maxCompressorsToRun( maxCompressorsToRun ), runningCompressors( 0 ) { verbosePrintf( "Using up to %zu thread(s) for compression\n", maxCompressorsToRun ); } Writer::~Writer() { waitForAllCompressorsToFinish(); } bool Writer::add( ChunkId const & id, void const * data, size_t size ) { if ( index.addChunk( id, getCurrentBundleId() ) ) { // Added to the index? 
Emit to the bundle then if ( getCurrentBundle().getPayloadSize() + size > storageInfo.bundle_max_payload_size() ) finishCurrentBundle(); getCurrentBundle().addChunk( id.toBlob(), data, size ); return true; } else return false; } void Writer::commit() { finishCurrentBundle(); waitForAllCompressorsToFinish(); // Move all bundles for ( size_t x = pendingBundleRenames.size(); x--; ) { PendingBundleRename & r = pendingBundleRenames[ x ]; r.first->moveOverTo( Bundle::generateFileName( r.second, bundlesDir, true ) ); } pendingBundleRenames.clear(); // Move the index file if ( indexFile.get() ) { indexFile.reset(); // Generate a random filename unsigned char buf[ 24 ]; // Same comments as for Bundle::IdSize Random::genaratePseudo( buf, sizeof( buf ) ); indexTempFile->moveOverTo( Dir::addPath( indexDir, toHex( buf, sizeof( buf ) ) ) ); indexTempFile.reset(); } } Bundle::Creator & Writer::getCurrentBundle() { if ( !currentBundle.get() ) currentBundle = new Bundle::Creator; return *currentBundle; } void Writer::finishCurrentBundle() { if ( !currentBundle.get() ) return; Bundle::Id const & bundleId = getCurrentBundleId(); if ( !indexFile.get() ) { // Create a new index file indexTempFile = tmpMgr.makeTemporaryFile(); indexFile = new IndexFile::Writer( encryptionKey, indexTempFile->getFileName() ); } indexFile->add( currentBundle->getCurrentBundleInfo(), bundleId ); sptr< TemporaryFile > file = tmpMgr.makeTemporaryFile(); pendingBundleRenames.push_back( PendingBundleRename( file, bundleId ) ); // Create a new compressor // Wait for some compressors to finish if there are too many of them Lock _( runningCompressorsMutex ); while ( runningCompressors >= maxCompressorsToRun ) runningCompressorsCondition.wait( runningCompressorsMutex ); Compressor * compressor = new Compressor( *this, currentBundle, file->getFileName() ); currentBundle.reset(); hasCurrentBundleId = false; compressor->start(); ++runningCompressors; } void Writer::waitForAllCompressorsToFinish() { Lock _( 
runningCompressorsMutex ); while ( runningCompressors ) runningCompressorsCondition.wait( runningCompressorsMutex ); } Bundle::Id const & Writer::getCurrentBundleId() { if ( !hasCurrentBundleId ) { // Generate a new one Random::genaratePseudo( ¤tBundleId, sizeof( currentBundleId ) ); hasCurrentBundleId = true; } return currentBundleId; } Writer::Compressor::Compressor( Writer & writer, sptr< Bundle::Creator > const & bundleCreator, string const & fileName ): writer( writer ), bundleCreator( bundleCreator ), fileName( fileName ) { } void * Writer::Compressor::Compressor::threadFunction() throw() { try { bundleCreator->write( fileName, writer.encryptionKey ); } catch( std::exception & e ) { FAIL( "Bunding writing failed: %s", e.what() ); } { Lock _( writer.runningCompressorsMutex ); CHECK( writer.runningCompressors, "no running compressors" ); --writer.runningCompressors; writer.runningCompressorsCondition.signal(); } detach(); // We're in detached thread, so no further cleanup is necessary delete this; return NULL; } Reader::Reader( StorageInfo const & storageInfo, EncryptionKey const & encryptionKey, ChunkIndex & index, string const & bundlesDir, size_t maxCacheSizeBytes ): storageInfo( storageInfo ), encryptionKey( encryptionKey ), index( index ), bundlesDir( bundlesDir ), // We need to have at least one cached reader, otherwise we would have to // unpack a bundle each time a chunk is read, even for consecutive chunks // in the same bundle cachedReaders( maxCacheSizeBytes < storageInfo.bundle_max_payload_size() ? 
1 : maxCacheSizeBytes / storageInfo.bundle_max_payload_size() ) { verbosePrintf( "Using up to %zu MB of RAM as cache\n", maxCacheSizeBytes / 1048576 ); } void Reader::get( ChunkId const & chunkId, string & data, size_t & size ) { if ( Bundle::Id const * bundleId = index.findChunk( chunkId ) ) { Bundle::Reader & reader = getReaderFor( *bundleId ); reader.get( chunkId.toBlob(), data, size ); } else { string blob = chunkId.toBlob(); throw exNoSuchChunk( toHex( ( unsigned char const * ) blob.data(), blob.size() ) ); } } Bundle::Reader & Reader::getReaderFor( Bundle::Id const & id ) { sptr< Bundle::Reader > & reader = cachedReaders.entry< Bundle::Reader >( string( ( char const * ) &id, sizeof( id ) ) ); if ( !reader.get() ) { // Load the bundle reader = new Bundle::Reader( Bundle::generateFileName( id, bundlesDir, false ), encryptionKey ); } return *reader; } } zbackup-1.2/chunk_storage.hh000066400000000000000000000077531220407031500161760ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef CHUNK_STORAGE_HH_INCLUDED__ #define CHUNK_STORAGE_HH_INCLUDED__ #include #include #include #include #include #include "bundle.hh" #include "chunk_id.hh" #include "chunk_index.hh" #include "encryption_key.hh" #include "ex.hh" #include "file.hh" #include "index_file.hh" #include "mt.hh" #include "nocopy.hh" #include "objectcache.hh" #include "sptr.hh" #include "tmp_mgr.hh" #include "zbackup.pb.h" namespace ChunkStorage { using std::string; using std::vector; using std::pair; DEF_EX( Ex, "Chunk storage exception", std::exception ) /// Allows adding new chunks to the storage by filling up new bundles with them /// and writing new index files class Writer: NoCopy { public: /// All new bundles and index files are created as temp files. Call commit() /// to move them to their permanent locations. commit() is never called /// automatically! 
Writer( StorageInfo const &, EncryptionKey const &, TmpMgr &, ChunkIndex & index, string const & bundlesDir, string const & indexDir, size_t maxCompressorsToRun ); /// Adds the given chunk to the store. If such a chunk has already existed /// in the index, does nothing and returns false bool add( ChunkId const &, void const * data, size_t size ); /// Commits all newly created bundles. Must be called before destroying the /// object -- otherwise all work will be removed from the temp dir and lost void commit(); ~Writer(); private: /// Performs the compression in a separate thread. Destroys itself once done class Compressor: public Thread { Writer & writer; sptr< Bundle::Creator > bundleCreator; string fileName; public: Compressor( Writer &, sptr< Bundle::Creator > const &, string const & fileName ); protected: virtual void * threadFunction() throw(); }; friend class Compressor; /// Returns the id of the currently written bundle. If there's none, generates /// one. If a bundle hasn't yet started, still generates it - once the bundle /// is started, it will be used then Bundle::Id const & getCurrentBundleId(); /// Returns *currentBundle or creates a new one Bundle::Creator & getCurrentBundle(); /// Writes the current bundle and deallocates it void finishCurrentBundle(); /// Wait for all compressors to finish void waitForAllCompressorsToFinish(); StorageInfo const & storageInfo; EncryptionKey const & encryptionKey; TmpMgr & tmpMgr; ChunkIndex & index; string bundlesDir, indexDir; sptr< TemporaryFile > indexTempFile; sptr< IndexFile::Writer > indexFile; sptr< Bundle::Creator > currentBundle; Bundle::Id currentBundleId; bool hasCurrentBundleId; size_t maxCompressorsToRun; Mutex runningCompressorsMutex; Condition runningCompressorsCondition; size_t runningCompressors; /// Maps temp file of the bundle to its id blob typedef pair< sptr< TemporaryFile >, Bundle::Id > PendingBundleRename; vector< PendingBundleRename > pendingBundleRenames; }; /// Allows retrieving existing 
chunks by extracting them from the bundles with /// the help of an Index object class Reader: NoCopy { public: DEF_EX_STR( exNoSuchChunk, "no such chunk found:", Ex ) Reader( StorageInfo const &, EncryptionKey const &, ChunkIndex & index, string const & bundlesDir, size_t maxCacheSizeBytes ); /// Loads the given chunk from the store into the given buffer. May throw file /// and decompression exceptions. 'data' may be enlarged but won't be shrunk. /// The size of the actual chunk would be stored in 'size' void get( ChunkId const &, string & data, size_t & size ); /// Retrieves the reader for the given bundle id. May employ caching Bundle::Reader & getReaderFor( Bundle::Id const & ); private: StorageInfo const & storageInfo; EncryptionKey const & encryptionKey; ChunkIndex & index; string bundlesDir; ObjectCache cachedReaders; }; } #endif zbackup-1.2/cmake/000077500000000000000000000000001220407031500140655ustar00rootroot00000000000000zbackup-1.2/cmake/FindLibLZMA.cmake000066400000000000000000000133251220407031500170660ustar00rootroot00000000000000# - Find LibLZMA # Find LibLZMA headers and library # # LIBLZMA_FOUND - True if liblzma is found. # LIBLZMA_INCLUDE_DIRS - Directory where liblzma headers are located. # LIBLZMA_LIBRARIES - Lzma libraries to link against. # LIBLZMA_HAS_AUTO_DECODER - True if lzma_auto_decoder() is found (required). # LIBLZMA_HAS_EASY_ENCODER - True if lzma_easy_encoder() is found (required). # LIBLZMA_HAS_LZMA_PRESET - True if lzma_lzma_preset() is found (required). 
# LIBLZMA_VERSION_MAJOR - The major version of lzma # LIBLZMA_VERSION_MINOR - The minor version of lzma # LIBLZMA_VERSION_PATCH - The patch version of lzma # LIBLZMA_VERSION_STRING - version number as a string (ex: "5.0.3") #============================================================================= # Copyright 2008 Per Øyvind Karlsen # Copyright 2009 Alexander Neundorf # Copyright 2009 Helio Chissini de Castro # Copyright 2012 Mario Bensi # # Distributed under the OSI-approved BSD License (the "License"): # # CMake - Cross Platform Makefile Generator # Copyright 2000-2011 Kitware, Inc., Insight Software Consortium # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # * Neither the names of Kitware, Inc., the Insight Software Consortium, # nor the names of their contributors may be used to endorse or promote # products derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # ------------------------------------------------------------------------------ # # The above copyright and license notice applies to distributions of # CMake in source and binary form. Some source files contain additional # notices of original copyright by their contributors; see each source # for details. Third-party software packages supplied with CMake under # compatible licenses provide their own copyright notices documented in # corresponding subdirectories. # # ------------------------------------------------------------------------------ # # CMake was initially developed by Kitware with the following sponsorship: # # * National Library of Medicine at the National Institutes of Health # as part of the Insight Segmentation and Registration Toolkit (ITK). # # * US National Labs (Los Alamos, Livermore, Sandia) ASC Parallel # Visualization Initiative. # # * National Alliance for Medical Image Computing (NAMIC) is funded by the # National Institutes of Health through the NIH Roadmap for Medical Research, # Grant U54 EB005149. # # * Kitware, Inc. 
#============================================================================= find_path(LIBLZMA_INCLUDE_DIR lzma.h ) find_library(LIBLZMA_LIBRARY lzma) if(LIBLZMA_INCLUDE_DIR AND EXISTS "${LIBLZMA_INCLUDE_DIR}/lzma/version.h") file(STRINGS "${LIBLZMA_INCLUDE_DIR}/lzma/version.h" LIBLZMA_HEADER_CONTENTS REGEX "#define LZMA_VERSION_[A-Z]+ [0-9]+") string(REGEX REPLACE ".*#define LZMA_VERSION_MAJOR ([0-9]+).*" "\\1" LIBLZMA_VERSION_MAJOR "${LIBLZMA_HEADER_CONTENTS}") string(REGEX REPLACE ".*#define LZMA_VERSION_MINOR ([0-9]+).*" "\\1" LIBLZMA_VERSION_MINOR "${LIBLZMA_HEADER_CONTENTS}") string(REGEX REPLACE ".*#define LZMA_VERSION_PATCH ([0-9]+).*" "\\1" LIBLZMA_VERSION_PATCH "${LIBLZMA_HEADER_CONTENTS}") set(LIBLZMA_VERSION_STRING "${LIBLZMA_VERSION_MAJOR}.${LIBLZMA_VERSION_MINOR}.${LIBLZMA_VERSION_PATCH}") unset(LIBLZMA_HEADER_CONTENTS) endif() # We're using new code known now as XZ, even library still been called LZMA # it can be found in http://tukaani.org/xz/ # Avoid using old codebase if (LIBLZMA_LIBRARY) include(CheckLibraryExists) CHECK_LIBRARY_EXISTS(${LIBLZMA_LIBRARY} lzma_auto_decoder "" LIBLZMA_HAS_AUTO_DECODER) CHECK_LIBRARY_EXISTS(${LIBLZMA_LIBRARY} lzma_easy_encoder "" LIBLZMA_HAS_EASY_ENCODER) CHECK_LIBRARY_EXISTS(${LIBLZMA_LIBRARY} lzma_lzma_preset "" LIBLZMA_HAS_LZMA_PRESET) endif () include(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibLZMA DEFAULT_MSG LIBLZMA_INCLUDE_DIR LIBLZMA_LIBRARY LIBLZMA_HAS_AUTO_DECODER LIBLZMA_HAS_EASY_ENCODER LIBLZMA_HAS_LZMA_PRESET ) if (LIBLZMA_FOUND) set(LIBLZMA_LIBRARIES ${LIBLZMA_LIBRARY}) set(LIBLZMA_INCLUDE_DIRS ${LIBLZMA_INCLUDE_DIR}) endif () mark_as_advanced( LIBLZMA_INCLUDE_DIR LIBLZMA_LIBRARY ) zbackup-1.2/debug.cc000066400000000000000000000002171220407031500144020ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later bool verboseMode = true; zbackup-1.2/debug.hh000066400000000000000000000010011220407031500144040ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef DEBUG_HH_INCLUDED__ #define DEBUG_HH_INCLUDED__ #include // Macros we use to output debugging information #ifndef NDEBUG #define dPrintf( ... ) (fprintf( stderr, __VA_ARGS__ )) #else #define dPrintf( ... ) #endif extern bool verboseMode; #define verbosePrintf( ... ) ({ if ( verboseMode ) \ fprintf( stderr, __VA_ARGS__ ); }) #endif zbackup-1.2/dir.cc000066400000000000000000000045071220407031500141000ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include #include #include #include #include #include "dir.hh" DIR * dir; namespace Dir { bool exists( string const & name ) { struct stat buf; return stat( name.c_str(), &buf ) == 0 && S_ISDIR( buf.st_mode ); } void create( string const & name ) { if ( mkdir( name.c_str(), 0777 ) != 0 ) throw exCantCreate( name ); } void remove( string const & name ) { if ( rmdir( name.c_str() ) != 0 ) throw exCantRemove( name ); } string addPath( string const & first, string const & second ) { if ( first.empty() ) return second; if ( second.empty() ) return first; if ( first[ first.size() - 1 ] == separator() ) return first + second; else return first + separator() + second; } string getRealPath( string const & path ) { if ( char * r = realpath( path.c_str(), NULL ) ) { string result( r ); free( r ); return result; } else throw exCantGetRealPath( path ); } string getDirName( string const & path ) { char const * c = path.c_str(); std::vector< char > copy( c, c + path.size() + 1 ); return dirname( copy.data() ); } Listing::Listing( string const & dirName ): dirName( dirName ) { dir = opendir( dirName.c_str() ); if ( !dir ) throw exCantList( dirName ); } Listing::~Listing() { closedir( dir ); 
} bool Listing::getNext( Entry & result ) { dirent entry; dirent * entryPtr; struct stat entryStats; for ( ; ; ) { if ( readdir_r( dir, &entry, &entryPtr ) != 0 ) throw exCantList( dirName ); if ( !entryPtr ) return false; #ifndef __APPLE__ if ( fstatat( dirfd( dir ), entry.d_name, &entryStats, AT_SYMLINK_NOFOLLOW ) != 0 ) #else if ( lstat( addPath( dirName, entry.d_name ).c_str(), &entryStats ) != 0) #endif throw exCantList( dirName ); bool isDir = S_ISDIR( entryStats.st_mode ); bool isSymLink = S_ISLNK( entryStats.st_mode ); if ( isDir && ( entry.d_name[ 0 ] == '.' && ( !entry.d_name[ 1 ] || entry.d_name[ 1 ] == '.' ) ) ) { // Skip the . or .. entries continue; } result = Entry( entry.d_name, isDir, isSymLink ); return true; } } } zbackup-1.2/dir.hh000066400000000000000000000036401220407031500141070ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef DIR_HH_INCLUDED__ #define DIR_HH_INCLUDED__ #include #include #include #include #include "ex.hh" #include "nocopy.hh" using std::string; /// Directory-related operations namespace Dir { DEF_EX( Ex, "Directory exception", std::exception ) DEF_EX_STR( exCantCreate, "Can't create directory", Ex ) DEF_EX_STR( exCantRemove, "Can't remove directory", Ex ) DEF_EX_STR( exCantList, "Can't list directory", Ex ) DEF_EX_STR( exCantGetRealPath, "Can't real path of", Ex ) /// Checks whether the given dir exists or not bool exists( string const & ); /// Creates the given directory void create( string const & ); /// Removes the given directory. It must be empty to be removed void remove( string const & ); /// Adds one path to another, e.g. 
for /hello/world and baz/bar, returns /// /hello/world/baz/bar string addPath( string const & first, string const & second ); /// Returns the canonicalized absolute pathname with symlinks resolved string getRealPath( string const & ); /// Returns the directory part of the given path string getDirName( string const & ); /// A separator used to separate names in the path. inline char separator() { return '/'; } class Entry { string fileName; bool dir; bool symlink; public: Entry() {} Entry( string const & fileName, bool dir, bool symlink ): fileName( fileName ), dir( dir ), symlink( symlink ) {} string const & getFileName() const { return fileName; } bool isDir() const { return dir; } bool isSymLink() const { return symlink; } }; /// Allows listing the directory class Listing: NoCopy { string dirName; DIR * dir; public: Listing( string const & dirName ); ~Listing(); /// Return true if entry was filled, false if end of dir was encountered bool getNext( Entry & ); }; } #endif zbackup-1.2/encrypted_file.cc000066400000000000000000000226361220407031500163210ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include #include #include "check.hh" #include "encrypted_file.hh" #include "endian.hh" #include "page_size.hh" #include "random.hh" namespace EncryptedFile { using Encryption::BlockSize; InputStream::InputStream( char const * fileName, EncryptionKey const & key, void const * iv_ ): file( fileName, UnbufferedFile::ReadOnly ), filePos( 0 ), key( key ), // Our buffer must be larger than BlockSize, as otherwise we won't be able // to handle PKCS#7 padding properly buffer( std::max( getPageSize(), ( unsigned ) BlockSize * 2 ) ), fill( 0 ), remainder( 0 ), backedUp( false ) { if ( key.hasKey() ) { memcpy( iv, iv_, sizeof( iv ) ); // Since we use padding, file size should be evenly dividable by the cipher // block size, and we should have at least one block UnbufferedFile::Offset size = file.size(); if ( !size || size % BlockSize ) throw exIncorrectFileSize(); } } bool InputStream::Next( void const ** data, int * size ) { // If we backed up, return the unconsumed data if ( backedUp ) backedUp = false; else { try { // Update adler32 for the previous block adler32.add( start, fill ); // Read more data if ( filePos && !remainder ) { // Once we're read a full block, we always have a remainder. 
If not, // this means we've hit the end of file already fill = 0; return false; } // If we have a remainder, move it to the beginning of buffer and make // it start the next block memmove( buffer.data(), start + fill, remainder ); start = buffer.data(); fill = file.read( start + remainder, buffer.size() - remainder ) + remainder; // remainder should techically be 0 now, but decrypt() will update it // anyway // remainder = 0; decrypt(); } catch( UnbufferedFile::exReadError & ) { fill = 0; // To make sure state is remaining consistent return false; } } *data = start; *size = fill; filePos += fill; return *size; } void InputStream::BackUp( int count ) { CHECK( count >= 0, "count is negative" ); if ( !backedUp ) { CHECK( (size_t) count <= fill, "Backing up too much" ); size_t consumed = fill - count; adler32.add( start, consumed ); start += consumed; fill = count; filePos -= count; backedUp = fill; // Don't make the next Next() return 0 bytes } else { CHECK( count == 0, "backing up after being backed up already" ); } } bool InputStream::Skip( int count ) { CHECK( count >= 0, "count is negative" ); // We always need to read and decrypt data, as otherwise both the state of // CBC and adler32 would be incorrect void const * data; int size; while( count ) { if ( !Next( &data, &size ) ) return false; else if ( size > count ) { BackUp( size - count ); break; } else count -= size; } return true; } int64_t InputStream::ByteCount() const { return filePos; } Adler32::Value InputStream::getAdler32() { // This makes all data consumed, if not already BackUp( 0 ); return adler32.result(); } void InputStream::read( void * buf, size_t size ) { void const * data; int avail; char * n = ( char * ) buf; while( size ) { if ( !Next( &data, &avail ) ) throw exReadFailed(); else if ( avail > ( ssize_t ) size ) { memcpy( n, data, size ); BackUp( avail - size ); break; } else { memcpy( n, data, avail ); n += avail; size -= avail; } } } void InputStream::checkAdler32() { Adler32::Value ours = 
getAdler32(); Adler32::Value r; read( &r, sizeof( r ) ); if ( ours != fromLittleEndian( r ) ) throw exAdlerMismatch(); } void InputStream::consumeRandomIv() { if ( key.hasKey() ) { char iv[ Encryption::IvSize ]; read( iv, sizeof( iv ) ); // read() can throw exceptions, Skip() can't } } void InputStream::decrypt() { if ( fill == buffer.size() ) { // When we have the full buffer, we set the last block of it aside and // treat the rest as the normal CBC sequence. The last block in the buffer // may be the last block of file, in which case we would need to handle // padding. That may happen the next time the function is called remainder = BlockSize; fill -= BlockSize; doDecrypt(); } else { // This is an end of file. Decrypt data treating the last block being // padded // Since we always have padding in the file and the last block is always // set apart when reading full buffers, we must have at least one block // to decrypt here doDecrypt(); // Unpad the last block if ( key.hasKey() ) fill -= BlockSize - Encryption::unpad( start + fill - BlockSize ); // We have not left any remainder this time remainder = 0; } } void InputStream::doDecrypt() { if ( !key.hasKey() ) return; // Since we use padding, file size should be evenly dividable by the cipher's // block size, and we should always have at least one block. When we get here, // we would always get the proper fill value unless those characteristics are // not met. 
We check for the same condition on construction, but the file // size can change while we are reading it // We don't throw an exception here as the interface we implement doesn't // support them CHECK( fill > 0 && !( fill % BlockSize ), "incorrect size of the encrypted " "file - must be non-zero and in multiples of %u", ( unsigned ) BlockSize ); // Copy the next iv prior to decrypting the data in place, as it will // not be available afterwards char newIv[ Encryption::IvSize ]; memcpy( newIv, Encryption::getNextDecryptionIv( start, fill ), sizeof( newIv ) ); // Decrypt the data Encryption::decrypt( iv, key.getKey(), start, start, fill ); // Copy the new iv memcpy( iv, newIv, sizeof( iv ) ); } OutputStream::OutputStream( char const * fileName, EncryptionKey const & key, void const * iv_ ): file( fileName, UnbufferedFile::WriteOnly ), filePos( 0 ), key( key ), buffer( getPageSize() ), start( buffer.data() ), avail( 0 ), backedUp( false ) { if ( key.hasKey() ) memcpy( iv, iv_, sizeof( iv ) ); } bool OutputStream::Next( void ** data, int * size ) { // If we backed up, return the unconsumed data if ( backedUp ) backedUp = false; else { try { // Update adler32 for the previous block adler32.add( start, avail ); // Encrypt and write the buffer if it had data if ( filePos ) encryptAndWrite( buffer.size() ); start = buffer.data(); avail = buffer.size(); } catch( UnbufferedFile::exWriteError & ) { avail = 0; // To make sure state is remaining consistent return false; } } *data = start; *size = avail; filePos += avail; return *size; } void OutputStream::BackUp( int count ) { CHECK( count >= 0, "count is negative" ); if ( !backedUp ) { CHECK( (size_t) count <= avail, "Backing up too much" ); size_t consumed = avail - count; adler32.add( start, consumed ); start += consumed; avail = count; filePos -= count; backedUp = avail; // Don't make the next Next() return 0 bytes } else { CHECK( count == 0, "backing up after being backed up already" ); } } int64_t 
OutputStream::ByteCount() const { return filePos; } Adler32::Value OutputStream::getAdler32() { // This makes all data consumed, if not already BackUp( 0 ); return adler32.result(); } void OutputStream::write( void const * buf, size_t size ) { void * data; int avail; char const * n = ( char const * ) buf; while( size ) { if ( !Next( &data, &avail ) ) throw exReadFailed(); else if ( avail > ( ssize_t ) size ) { memcpy( data, n, size ); BackUp( avail - size ); break; } else { memcpy( data, n, avail ); n += avail; size -= avail; } } } void OutputStream::writeAdler32() { Adler32::Value v = toLittleEndian( getAdler32() ); write( &v, sizeof( v ) ); } void OutputStream::writeRandomIv() { if ( key.hasKey() ) { char iv[ Encryption::IvSize ]; Random::genaratePseudo( iv, sizeof( iv ) ); write( iv, sizeof( iv ) ); } } void OutputStream::encryptAndWrite( size_t bytes ) { if ( key.hasKey() ) { CHECK( bytes > 0 && !( bytes % BlockSize ), "incorrect number of bytes to " "encrypt and write - must be non-zero and in multiples of %u", ( unsigned ) BlockSize ); void const * nextIv = Encryption::encrypt( iv, key.getKey(), buffer.data(), buffer.data(), bytes ); memcpy( iv, nextIv, sizeof( iv ) ); } file.write( buffer.data(), bytes ); } OutputStream::~OutputStream() { // This makes all data consumed, if not already BackUp( 0 ); // If we have the full buffer, write it first if ( start == buffer.data() + buffer.size() ) { encryptAndWrite( buffer.size() ); start = buffer.data(); } size_t bytesToWrite = start - buffer.data(); if ( key.hasKey() ) { // Perform padding size_t remainderSize = bytesToWrite % BlockSize; Encryption::pad( start - remainderSize, remainderSize ); bytesToWrite += BlockSize - remainderSize; } encryptAndWrite( bytesToWrite ); } } zbackup-1.2/encrypted_file.hh000066400000000000000000000120621220407031500163230ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef ENCRYPTED_FILE_HH_INCLUDED__ #define ENCRYPTED_FILE_HH_INCLUDED__ #include #include #include #include #include #include #include "adler32.hh" #include "encryption.hh" #include "encryption_key.hh" #include "ex.hh" #include "unbuffered_file.hh" /// Google's ZeroCopyStream implementations which read and write files encrypted /// with our encryption mechanism. They also calculate adler32 of all file /// content and write/check it at the end. /// Encryption-wise we implement AES-128 in CBC mode with PKCS#7 padding. We /// don't use EVP for this currently - everyone is welcome to change this, and /// to add support for arbitrary ciphers, key lengths and modes of operations as /// well. When no encryption key is set, no encryption or padding is done, but /// everything else works the same way otherwise namespace EncryptedFile { DEF_EX( Ex, "Encrypted file exception", std::exception ) DEF_EX( exFileCorrupted, "encrypted file data is currupted", Ex ) DEF_EX( exIncorrectFileSize, "size of the encrypted file is incorrect", exFileCorrupted ) DEF_EX( exReadFailed, "read failed", Ex ) // Only thrown by InputStream::read() DEF_EX( exAdlerMismatch, "adler32 mismatch", Ex ) class InputStream: public google::protobuf::io::ZeroCopyInputStream { public: /// Opens the input file. If EncryptionKey contains no key, the input won't be /// decrypted and iv would be ignored InputStream( char const * fileName, EncryptionKey const &, void const * iv ); virtual bool Next( void const ** data, int * size ); virtual void BackUp( int count ); virtual bool Skip( int count ); virtual int64_t ByteCount() const; /// Returns adler32 of all data read so far. Calling this makes backing up /// for the previous Next() call impossible - the data has to be consumed Adler32::Value getAdler32(); /// Performs a traditional read, for convenience purposes void read( void * buf, size_t size ); /// Reads an adler32 value from the stream and compares with checkAdler32(). 
/// Throws an exception on mismatch void checkAdler32(); /// Reads and discards the number of bytes equivalent to an IV size. This is /// used when no IV is initially provided. /// If there's no encryption key set, does nothing void consumeRandomIv(); /// Closes the file ~InputStream() {} private: UnbufferedFile file; UnbufferedFile::Offset filePos; EncryptionKey const & key; char iv[ Encryption::IvSize ]; std::vector< char > buffer; char * start; /// Points to the start of the data currently held in buffer size_t fill; /// Number of bytes held in buffer size_t remainder; /// Number of bytes held in buffer just after the main /// 'fill'-bytes portion. We have to keep those to implement /// PKCS#7 padding bool backedUp; /// True if the BackUp operation was performed, and the buffer /// contents are therefore unconsumed Adler32 adler32; /// Decrypts 'fill' bytes at 'start', adjusting 'fill' and setting 'remainder' void decrypt(); /// Only used by decrypt() void doDecrypt(); }; class OutputStream: public google::protobuf::io::ZeroCopyOutputStream { public: /// Creates the output file. If EncryptionKey contains no key, the output /// won't be encrypted and iv would be ignored OutputStream( char const * fileName, EncryptionKey const &, void const * iv ); virtual bool Next( void ** data, int * size ); virtual void BackUp( int count ); virtual int64_t ByteCount() const; /// Returns adler32 of all data written so far. Calling this makes backing up /// for the previous Next() call impossible - the data has to be consumed Adler32::Value getAdler32(); /// Performs a traditional write, for convenience purposes void write( void const * buf, size_t size ); /// Writes the current adler32 value returned by getAdler32() to the stream void writeAdler32(); /// Writes the number of random bytes equivalent to an IV size. This is used /// when no IV is initially provided, and provides an equivalent of having /// a random IV when used just after the stream has been opened. 
/// If there's no encryption key set, does nothing void writeRandomIv(); /// Finishes writing and closes the file ~OutputStream(); private: UnbufferedFile file; UnbufferedFile::Offset filePos; EncryptionKey const & key; char iv[ Encryption::IvSize ]; std::vector< char > buffer; char * start; /// Points to the start of the area currently available for /// writing to in buffer size_t avail; /// Number of bytes available for writing to in buffer bool backedUp; /// True if the BackUp operation was performed, and the buffer /// contents are therefore unconsumed Adler32 adler32; /// Encrypts and writes 'bytes' bytes from the beginning of the buffer. /// 'bytes' must be non-zero and in multiples of BlockSize void encryptAndWrite( size_t bytes ); }; } #endif zbackup-1.2/encryption.cc000066400000000000000000000063311220407031500155110ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include "check.hh" #include "encryption.hh" #include "static_assert.hh" namespace Encryption { char const ZeroIv[ IvSize ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; void const * encrypt( void const * iv, void const * keyData, void const * inData, void * outData, size_t size ) { unsigned char block[ BlockSize ]; CHECK( !( size % BlockSize ), "size of data to encrypt is not a multiple of " "block size" ); AES_KEY key; AES_set_encrypt_key( ( unsigned char const * ) keyData, KeySize * 8, &key ); void const * prev = iv; // We do the operation in block size multiples. We do XOR in size_t // multiples. 
The operation is endian-neutral // Make sure that BlockSize is a multiple of the size of size_t STATIC_ASSERT( !( BlockSize % sizeof( size_t ) ) ); size_t const * inS = ( size_t const * ) inData; unsigned char * out = ( unsigned char * ) outData; for ( size_t count = size / BlockSize; count--; ) { size_t const * prevS = ( size_t const * ) prev; size_t * blockS = ( size_t * ) block; for ( size_t x = BlockSize / sizeof( size_t ); x--; ) *blockS++ = *inS++ ^ *prevS++; AES_encrypt( block, out, &key ); prev = out; out += BlockSize; } return prev; } void const * getNextDecryptionIv( void const * in, size_t size ) { CHECK( !( size % BlockSize ), "size of data to decrypt is not a multiple of " "block size" ); return ( char const * ) in + size - BlockSize; } void decrypt( void const * iv, void const * keyData, void const * inData, void * outData, size_t size ) { CHECK( !( size % BlockSize ), "size of data to decrypt is not a multiple of " "block size" ); AES_KEY key; AES_set_decrypt_key( ( unsigned char const * ) keyData, KeySize * 8, &key ); // We decrypt from the end to the beginning unsigned char const * in = ( unsigned char const * ) inData + size; unsigned char * out = ( unsigned char * ) outData + size; size_t count = size / BlockSize; size_t const * prevS = ( size_t const * )( in - BlockSize ); size_t * outS = ( size_t * ) out; while( count-- ) { if ( prevS == inData ) prevS = ( size_t const * )( ( unsigned char const * ) iv + BlockSize ); in -= BlockSize; AES_decrypt( in, ( unsigned char * ) outS - BlockSize, &key ); for ( size_t x = BlockSize / sizeof( size_t ); x--; ) *--outS ^= *--prevS; } } void pad( void * data, size_t size ) { CHECK( size < BlockSize, "size to pad is too large: %zu bytes", size ); unsigned char * p = ( unsigned char * ) data + size; unsigned char v = BlockSize - size; for ( size_t count = v; count--; ) *p++ = v; } size_t unpad( void const * data ) { unsigned char const * p = ( unsigned char const * ) data + BlockSize - 1; unsigned char v = *p; 
if ( !v || v > BlockSize ) throw exBadPadding(); // Check the rest of the padding for ( size_t count = v - 1; count--; ) if ( *--p != v ) throw exBadPadding(); return BlockSize - v; } } zbackup-1.2/encryption.hh000066400000000000000000000040531220407031500155220ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef ENCRYPTION_HH_INCLUDED__ #define ENCRYPTION_HH_INCLUDED__ #include #include #include "ex.hh" /// What we implement right now is AES-128 in CBC mode with PKCS#7 padding namespace Encryption { enum { KeySize = 16, /// Size of the key in bytes IvSize = 16, /// Size of the IV data in bytes BlockSize = 16 /// Cipher block size in bytes }; DEF_EX( exBadPadding, "Bad padding encountered", std::exception ) /// Encrypts 'size' bytes of the data pointed to by 'in', outputting 'size' /// bytes to 'out'. 'key' points to KeySize bytes of the key data. 'iv' points /// to IvSize bytes used as an initialization vector. 'in' and 'out' can be the /// same. 'size' must be a multiple of BlockSize. Returns a pointer to the /// IV which should be used to continue encrypting, which in CBC is the last /// encrypted block void const * encrypt( void const * iv, void const * key, void const * in, void * out, size_t size ); /// Returns a pointer to the IV which should be used to decrypt the block next /// to the given one, which in CBC is the last encrypted block. Note that if an /// in-place decryption is performed, this IV should be saved first, as it will /// be overwritten with the decrypted data. 
For size == 0, the returned pointer /// is invalid void const * getNextDecryptionIv( void const * in, size_t size ); /// The reverse of encrypt() void decrypt( void const * iv, void const * key, void const * in, void * out, size_t size ); /// Pads the last block to be encrypted, pointed to by 'data', 'size' bytes, /// which should be less than BlockSize, to occupy BlockSize bytes void pad( void * data, size_t size ); /// Returns the size of the padded data. The data itself is unchanged - use the /// first bytes of 'data'. Can throw exBadPadding size_t unpad( void const * data ); /// The IV consisting of zero bytes. Use it when there is no IV extern char const ZeroIv[ IvSize ]; } #endif zbackup-1.2/encryption_key.cc000066400000000000000000000063511220407031500163630ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include #include #include #include "check.hh" #include "encryption_key.hh" #include "random.hh" namespace { /// Derives an encryption key from a password and key info void deriveKey( string const & password, EncryptionKeyInfo const & info, void * key, unsigned keySize ) { CHECK( PKCS5_PBKDF2_HMAC_SHA1( password.data(), password.size(), (unsigned char const *) info.salt().data(), info.salt().size(), info.rounds(), keySize, (unsigned char *) key ) == 1, "encryption key derivation failed" ); } string calculateKeyHmac( void const * key, unsigned keySize, string const & input ) { char result[ EVP_MAX_MD_SIZE ]; unsigned resultSize; CHECK( HMAC( EVP_sha1(), (unsigned char const *) key, keySize, (unsigned char const *) input.data(), input.size(), (unsigned char *) result, &resultSize ), "encryption key HMAC calcuation failed" ); return string( result, result + resultSize ); } } EncryptionKey::EncryptionKey( string const & password, EncryptionKeyInfo const * info ) { if ( !info ) isSet = false; else { isSet = true; char derivedKey[ KeySize ]; deriveKey( password, *info, 
derivedKey, sizeof( derivedKey ) ); AES_KEY aesKey; AES_set_decrypt_key( ( unsigned char const * ) derivedKey, 128, &aesKey ); AES_decrypt( ( unsigned char const * ) info->encrypted_key().data(), ( unsigned char * ) key, &aesKey ); if ( calculateKeyHmac( key, sizeof( key ), info->key_check_input() ) != info->key_check_hmac() ) throw exInvalidPassword(); } } EncryptionKey::~EncryptionKey() { // Clear the key from memory memset( key, 0, sizeof( key ) ); } void EncryptionKey::generate( string const & password, EncryptionKeyInfo & info ) { // Use this buf for salts char buf[ 16 ]; Random::genaratePseudo( buf, sizeof( buf ) ); info.set_salt( buf, sizeof( buf ) ); info.set_rounds( 10000 ); // TODO: make this configurable char derivedKey[ KeySize ]; deriveKey( password, info, derivedKey, sizeof( derivedKey ) ); char key[ KeySize ]; Random::genarateTrue( key, sizeof( key ) ); // Fill in the HMAC verification part Random::genaratePseudo( buf, sizeof( buf ) ); info.set_key_check_input( buf, sizeof( buf ) ); info.set_key_check_hmac( calculateKeyHmac( key, sizeof( key ), info.key_check_input() ) ); // Encrypt the key AES_KEY aesKey; AES_set_encrypt_key( ( unsigned char const * ) derivedKey, 128, &aesKey ); char encryptedKey[ sizeof( key ) ]; AES_encrypt( ( unsigned char const * ) key, ( unsigned char * ) encryptedKey, &aesKey ); info.set_encrypted_key( encryptedKey, sizeof( encryptedKey ) ); // Clear the key from memory memset( key, 0, sizeof( key ) ); } EncryptionKey const & EncryptionKey::noKey() { static EncryptionKey key( string(), NULL ); return key; } zbackup-1.2/encryption_key.hh000066400000000000000000000025421220407031500163730ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef ENCRYPTION_KEY_HH_INCLUDED__ #define ENCRYPTION_KEY_HH_INCLUDED__ #include #include #include "ex.hh" #include "zbackup.pb.h" using std::string; class EncryptionKey { bool isSet; unsigned const static KeySize = 16; // TODO: make this configurable char key[ KeySize ]; public: DEF_EX( exInvalidPassword, "Invalid password specified", std::exception ) /// Decodes the encryption key from the given info and password. If info is /// passed as NULL, the password is ignored and no key is set EncryptionKey( string const & password, EncryptionKeyInfo const * ); ~EncryptionKey(); /// Returns true if key was set, false otherwise. bool hasKey() const { return isSet; } /// Returns the key. Check if there is one with hasKey() first. Note: the key /// should not be copied, as it may be allocated in a locked page in the /// future void const * getKey() const { return key; } /// Returns key size, in bytes unsigned getKeySize() const { return sizeof( key ); } /// Generates new key info using the given password static void generate( string const & password, EncryptionKeyInfo & ); /// Returns a static instance without any key set static EncryptionKey const & noKey(); }; #endif zbackup-1.2/endian.hh000066400000000000000000000016241220407031500145670ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef ENDIAN_HH_INCLUDED__ #define ENDIAN_HH_INCLUDED__ #include #include #ifdef __APPLE__ #include #else #include #endif #if __BYTE_ORDER != __LITTLE_ENDIAN #error Please add support for architectures different from little-endian. 
#endif /// Converts the given host-order value to big-endian value inline uint32_t toBigEndian( uint32_t v ) { return htonl( v ); } /// Converts the given host-order value to little-endian value inline uint32_t toLittleEndian( uint32_t v ) { return v; } inline uint64_t toLittleEndian( uint64_t v ) { return v; } /// Converts the given little-endian value to host-order value inline uint32_t fromLittleEndian( uint32_t v ) { return v; } inline uint64_t fromLittleEndian( uint64_t v ) { return v; } #endif zbackup-1.2/ex.hh000066400000000000000000000031511220407031500137420ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef EX_HH_INCLUDED__ #define EX_HH_INCLUDED__ #include #include #include /// A way to declare an exception class fast /// Do like this: /// DEF_EX( exErrorInFoo, "An error in foo encountered", std::exception ) /// DEF_EX( exFooNotFound, "Foo was not found", exErrorInFoo ) #define DEF_EX( exName, exDescription, exParent ) \ class exName: public exParent { \ public: \ virtual const char * what() const throw() { return (exDescription); } \ virtual ~exName() throw() {} }; /// Same as DEF_EX, but takes a runtime string argument, which gets concatenated /// with the description. /// /// DEF_EX_STR( exCantOpen, "can't open file", std::exception ) /// ... 
/// throw exCantOpen( "example.txt" ); /// /// what() would return "can't open file example.txt" #define DEF_EX_STR( exName, exDescription, exParent ) \ class exName: public exParent { \ std::string value; \ public: \ exName( std::string const & value_ ): value( std::string( exDescription ) + " " + value_ ) {} \ exName( char const * value_, unsigned size ): value( std::string( exDescription ) + " " + std::string( value_, size ) ) {} \ virtual const char * what() const throw() { return value.c_str(); } \ virtual ~exName() throw() {} }; /// An exception class to wrap leave code into an std::exception class exLeaveWrapped: public std::exception { char buf[ 32 ]; public: exLeaveWrapped( int error ) { sprintf( buf, "%d", error ); } char const * what() const throw() { return buf; } }; #endif zbackup-1.2/file.cc000066400000000000000000000152331220407031500142370ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include #include #include #include #include "file.hh" enum { // We employ a writing buffer to considerably speed up file operations when // they consists of many small writes. 
The default size for the buffer is 64k WriteBufferSize = 65536 }; bool File::exists( char const * filename ) throw() { #ifdef __WIN32 struct _stat buf; return _stat( filename, &buf ) == 0; #else struct stat buf; // EOVERFLOW rationale: if the file is too large, it still does exist return stat( filename, &buf ) == 0 || errno == EOVERFLOW; #endif } void File::erase( std::string const & filename ) throw( exCantErase ) { if ( remove( filename.c_str() ) != 0 ) throw exCantErase( filename ); } void File::rename( std::string const & from, std::string const & to ) throw( exCantRename ) { if ( ::rename( from.c_str(), to.c_str() ) != 0 ) throw exCantRename( from + " to " + to ); } void File::open( char const * filename, OpenMode mode ) throw( exCantOpen ) { char const * m; switch( mode ) { case Update: m = "r+b"; break; case WriteOnly: m = "wb"; break; default: m = "rb"; } f = fopen( filename, m ); if ( !f ) throw exCantOpen( std::string( filename ) + ": " + strerror( errno ) ); } File::File( char const * filename, OpenMode mode ) throw( exCantOpen ): writeBuffer( 0 ) { open( filename, mode ); } File::File( std::string const & filename, OpenMode mode ) throw( exCantOpen ): writeBuffer( 0 ) { open( filename.c_str(), mode ); } void File::read( void * buf, size_t size ) throw( exReadError, exWriteError ) { if ( !size ) return; if ( writeBuffer ) flushWriteBuffer(); size_t result = fread( buf, size, 1, f ); if ( result != 1 ) { if ( !ferror( f ) ) throw exShortRead(); else throw exReadErrorDetailed( f ); } } size_t File::readRecords( void * buf, size_t size, size_t count ) throw( exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); return fread( buf, size, count, f ); } void File::write( void const * buf, size_t size ) throw( exWriteError ) { if ( !size ) return; if ( size >= WriteBufferSize ) { // If the write is large, there's not much point in buffering flushWriteBuffer(); size_t result = fwrite( buf, size, 1, f ); if ( result != 1 ) throw exWriteError(); return; } if ( 
!writeBuffer ) { // Allocate the writing buffer since we don't have any yet writeBuffer = new char[ WriteBufferSize ]; writeBufferLeft = WriteBufferSize; } size_t toAdd = size < writeBufferLeft ? size : writeBufferLeft; memcpy( writeBuffer + ( WriteBufferSize - writeBufferLeft ), buf, toAdd ); size -= toAdd; writeBufferLeft -= toAdd; if ( !writeBufferLeft ) // Out of buffer? Flush it { flushWriteBuffer(); if ( size ) // Something's still left? Add to buffer { memcpy( writeBuffer, (char const *)buf + toAdd, size ); writeBufferLeft -= size; } } } size_t File::writeRecords( void const * buf, size_t size, size_t count ) throw( exWriteError ) { flushWriteBuffer(); return fwrite( buf, size, count, f ); } char * File::gets( char * s, int size, bool stripNl ) throw( exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); char * result = fgets( s, size, f ); if ( result && stripNl ) { size_t len = strlen( result ); char * last = result + len; while( len-- ) { --last; if ( *last == '\n' || *last == '\r' ) *last = 0; else break; } } return result; } std::string File::gets( bool stripNl ) throw( exReadError, exWriteError ) { char buf[ 1024 ]; if ( !gets( buf, sizeof( buf ), stripNl ) ) { if ( !ferror( f ) ) throw exShortRead(); else throw exReadErrorDetailed( f ); } return std::string( buf ); } void File::seek( long offset ) throw( exSeekError, exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); if ( fseek( f, offset, SEEK_SET ) != 0 ) throw exSeekError(); } void File::seekCur( long offset ) throw( exSeekError, exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); if ( fseek( f, offset, SEEK_CUR ) != 0 ) throw exSeekError(); } void File::seekEnd( long offset ) throw( exSeekError, exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); if ( fseek( f, offset, SEEK_END ) != 0 ) throw exSeekError(); } void File::rewind() throw( exSeekError, exWriteError ) { seek( 0 ); } size_t File::tell() throw( exSeekError ) { long result = ftell( f ); if ( result == -1 ) throw 
exSeekError(); if ( writeBuffer ) result += ( WriteBufferSize - writeBufferLeft ); return ( size_t ) result; } size_t File::size() throw( exSeekError, exWriteError ) { size_t cur = tell(); seekEnd( 0 ); size_t result = tell(); seek( cur ); return result; } bool File::eof() throw( exWriteError ) { if ( writeBuffer ) flushWriteBuffer(); return feof( f ); } FILE * File::file() throw( exWriteError ) { flushWriteBuffer(); return f; } FILE * File::release() throw( exWriteError ) { releaseWriteBuffer(); FILE * c = f; f = 0; return c; } void File::close() throw( exWriteError ) { fclose( release() ); } File::~File() throw() { if ( f ) { try { releaseWriteBuffer(); } catch( exWriteError & ) { } fclose( f ); } } void File::flushWriteBuffer() throw( exWriteError ) { if ( writeBuffer && writeBufferLeft != WriteBufferSize ) { size_t result = fwrite( writeBuffer, WriteBufferSize - writeBufferLeft, 1, f ); if ( result != 1 ) throw exWriteError(); writeBufferLeft = WriteBufferSize; } } void File::releaseWriteBuffer() throw( exWriteError ) { flushWriteBuffer(); if ( writeBuffer ) { delete [] writeBuffer; writeBuffer = 0; } } File::exReadErrorDetailed::exReadErrorDetailed( int fd ) { buildDescription( fd ); } File::exReadErrorDetailed::exReadErrorDetailed( FILE * f ) { buildDescription( fileno( f ) ); } void File::exReadErrorDetailed::buildDescription( int fd ) { description = "Error reading from file "; char path[ PATH_MAX ]; char procFdLink[ 48 ]; sprintf( procFdLink, "/proc/self/fd/%d", fd ); int pathChars = readlink( procFdLink, path, sizeof( path ) ); if ( pathChars < 0 ) description += "(unknown)"; else description.append( path, pathChars ); } const char * File::exReadErrorDetailed::what() const throw() { return description.c_str(); } File::exReadErrorDetailed::~exReadErrorDetailed() throw () { } zbackup-1.2/file.hh000066400000000000000000000123571220407031500142550ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef FILE_HH_INCLUDED__ #define FILE_HH_INCLUDED__ #include #include #include #include #include "ex.hh" using std::string; /// A simple wrapper over FILE * operations with added write-buffering class File { FILE * f; char * writeBuffer; size_t writeBufferLeft; public: DEF_EX( Ex, "File exception", std::exception ) DEF_EX_STR( exCantOpen, "Can't open", Ex ) DEF_EX( exReadError, "Error reading from file", Ex ) DEF_EX( exShortRead, "Short read from the file", exReadError ) DEF_EX( exWriteError, "Error writing to the file", Ex ) DEF_EX( exSeekError, "File seek error", Ex ) DEF_EX_STR( exCantErase, "Can't erase file", Ex ) DEF_EX_STR( exCantRename, "Can't rename file", Ex ) enum OpenMode { ReadOnly, WriteOnly, Update }; typedef long Offset; File( char const * filename, OpenMode ) throw( exCantOpen ); File( std::string const & filename, OpenMode ) throw( exCantOpen ); /// Reads the number of bytes to the buffer, throws an error if it /// failed to fill the whole buffer (short read, i/o error etc) void read( void * buf, size_t size ) throw( exReadError, exWriteError ); template< typename T > void read( T & value ) throw( exReadError, exWriteError ) { read( &value, sizeof( value ) ); } template< typename T > T read() throw( exReadError, exWriteError ) { T value; read( value ); return value; } /// Attempts reading at most 'count' records sized 'size'. Returns /// the number of records it managed to read, up to 'count' size_t readRecords( void * buf, size_t size, size_t count ) throw( exWriteError ); /// Writes the number of bytes from the buffer, throws an error if it /// failed to write the whole buffer (short write, i/o error etc). /// This function employs write buffering, and as such, writes may not /// end up on disk immediately, or a short write may occur later /// than it really did. 
If you don't want write buffering, use /// writeRecords() function instead void write( void const * buf, size_t size ) throw( exWriteError ); template< typename T > void write( T const & value ) throw( exWriteError ) { write( &value, sizeof( value ) ); } /// Attempts writing at most 'count' records sized 'size'. Returns /// the number of records it managed to write, up to 'count'. /// This function does not employ buffering, but flushes the buffer if it /// was used before size_t writeRecords( void const * buf, size_t size, size_t count ) throw( exWriteError ); /// Reads a string from the file. Unlike the normal fgets(), this one /// can strip the trailing newline character, if this was requested. /// Returns either s or 0 if no characters were read char * gets( char * s, int size, bool stripNl = false ) throw( exWriteError ); /// Like the above, but uses its own local internal buffer (1024 bytes /// currently), and strips newlines by default std::string gets( bool stripNl = true ) throw( exReadError, exWriteError ); /// Seeks in the file, relative to its beginning void seek( long offset ) throw( exSeekError, exWriteError ); /// Seeks in the file, relative to the current position void seekCur( long offset ) throw( exSeekError, exWriteError ); /// Seeks in the file, relative to the end of file void seekEnd( long offset = 0 ) throw( exSeekError, exWriteError ); /// Seeks to the beginning of file void rewind() throw( exSeekError, exWriteError ); /// Tells the current position within the file, relative to its beginning size_t tell() throw( exSeekError ); /// Returns file size size_t size() throw( exSeekError, exWriteError ); /// Returns true if end-of-file condition is set bool eof() throw( exWriteError ); /// Returns the underlying FILE * record, so other operations can be /// performed on it FILE * file() throw( exWriteError ); /// Releases the file handle out of the control of the class. No further /// operations are valid. 
The file will not be closed on destruction FILE * release() throw( exWriteError ); /// Closes the file. No further operations are valid void close() throw( exWriteError ); /// Checks if the file exists or not static bool exists( char const * filename ) throw(); static bool exists( std::string const & filename ) throw() { return exists( filename.c_str() ); } ~File() throw(); /// Erases the given file static void erase( std::string const & ) throw( exCantErase ); /// Renames the given file static void rename( std::string const & from, std::string const & to ) throw( exCantRename ); /// Throwing this class instead of exReadError will make the description /// include the file name class exReadErrorDetailed: public exReadError { string description; public: exReadErrorDetailed( int fd ); exReadErrorDetailed( FILE * f ); virtual const char * what() const throw(); virtual ~exReadErrorDetailed() throw (); private: void buildDescription( int fd ); }; private: void open( char const * filename, OpenMode ) throw( exCantOpen ); void flushWriteBuffer() throw( exWriteError ); void releaseWriteBuffer() throw( exWriteError ); }; #endif zbackup-1.2/hex.cc000066400000000000000000000013601220407031500141000ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "hex.hh" using std::string; namespace { /// Converts 'size' bytes pointed to by 'in' into a hex string pointed to by /// 'out'. It should have at least size * 2 bytes. No trailing zero is added void hexify( unsigned char const * in, unsigned size, char * out ) { while( size-- ) { unsigned char v = *in++; *out++ = ( v >> 4 < 10 ) ? '0' + ( v >> 4 ) : 'a' + ( v >> 4 ) - 10; *out++ = ( ( v & 0xF ) < 10 ) ? 
'0' + ( v & 0xF ) : 'a' + ( v & 0xF ) - 10; } } } string toHex( unsigned char const * in, unsigned size ) { string result( size * 2, 0 ); hexify( in, size, &result[ 0 ] ); return result; } zbackup-1.2/hex.hh000066400000000000000000000005031220407031500141100ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef HEX_HH_INCLUDED__ #define HEX_HH_INCLUDED__ #include /// Converts 'size' bytes pointed to by 'in' into a hex string std::string toHex( unsigned char const * in, unsigned size ); #endif zbackup-1.2/index_file.cc000066400000000000000000000032401220407031500154210ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include "bundle.hh" #include "encryption.hh" #include "index_file.hh" #include "message.hh" namespace IndexFile { enum { FileFormatVersion = 1 }; Writer::Writer( EncryptionKey const & key, string const & fileName ): stream( fileName.c_str(), key, Encryption::ZeroIv ) { stream.writeRandomIv(); FileHeader header; header.set_version( FileFormatVersion ); Message::serialize( header, stream ); } void Writer::add( BundleInfo const & info, Bundle::Id const & bundleId ) { IndexBundleHeader header; header.set_id( &bundleId, sizeof( bundleId ) ); Message::serialize( header, stream ); Message::serialize( info, stream ); } Writer::~Writer() { // Final record which does not have a bundle id IndexBundleHeader header; Message::serialize( header, stream ); stream.writeAdler32(); } Reader::Reader( EncryptionKey const & key, string const & fileName ): stream( fileName.c_str(), key, Encryption::ZeroIv ) { stream.consumeRandomIv(); FileHeader header; Message::parse( header, stream ); if ( header.version() != FileFormatVersion ) throw exUnsupportedVersion(); } bool Reader::readNextRecord( BundleInfo & info, Bundle::Id & bundleId ) { IndexBundleHeader header; Message::parse( header, stream ); if ( 
header.has_id() ) { if ( header.id().size() != sizeof( bundleId ) ) throw exIncorrectBundleIdSize(); memcpy( &bundleId, header.id().data(), sizeof( bundleId ) ); Message::parse( info, stream ); return true; } else { stream.checkAdler32(); return false; } } } zbackup-1.2/index_file.hh000066400000000000000000000030641220407031500154370ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef INDEX_FILE_HH_INCLUDED__ #define INDEX_FILE_HH_INCLUDED__ #include #include #include "adler32.hh" #include "bundle.hh" #include "encrypted_file.hh" #include "encryption_key.hh" #include "ex.hh" #include "file.hh" #include "nocopy.hh" #include "zbackup.pb.h" /// Index files store all existing chunk ids and their bundle ids. This /// information can also be retrieved by scanning all bundle files, but that /// would incur a lot of disk seeks which we want to minimize here namespace IndexFile { using std::string; /// Creates index files class Writer: NoCopy { EncryptedFile::OutputStream stream; public: /// Creates a new chunk log. Initially it is stored in a temporary file Writer( EncryptionKey const &, string const & fileName ); /// Adds a bundle info to the log void add( BundleInfo const &, Bundle::Id const & bundleId ); /// Finalizes the file ~Writer(); }; /// Reads index files class Reader: NoCopy { EncryptedFile::InputStream stream; public: DEF_EX( Ex, "Index file reader exception", std::exception ) DEF_EX( exUnsupportedVersion, "Unsupported version of the index file format", Ex ) DEF_EX( exIncorrectBundleIdSize, "Incorrect bundle id size encountered", Ex ) Reader( EncryptionKey const &, string const & fileName ); /// Reads the next record from the file. 
Returns false if no more records can /// be found bool readNextRecord( BundleInfo &, Bundle::Id & bundleId ); }; } #endif zbackup-1.2/message.cc000066400000000000000000000020161220407031500147370ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "message.hh" #include namespace Message { void serialize( MessageLite const & message, ZeroCopyOutputStream & stream ) { CodedOutputStream cos( &stream ); serialize( message, cos ); } void serialize( MessageLite const & message, CodedOutputStream & cos ) { cos.WriteVarint32( message.ByteSize() ); message.SerializeWithCachedSizes( &cos ); if ( cos.HadError() ) throw exCantSerialize( message.GetTypeName() ); } void parse( MessageLite & message, ZeroCopyInputStream & stream ) { CodedInputStream cis( &stream ); parse( message, cis ); } void parse( MessageLite & message, CodedInputStream & cis ) { uint32_t v; if ( !cis.ReadVarint32( &v ) ) throw exCantParse( message.GetTypeName() ); CodedInputStream::Limit limit = cis.PushLimit( v ); if( !message.ParseFromCodedStream( &cis ) ) throw exCantParse( message.GetTypeName() ); cis.PopLimit( limit ); } } zbackup-1.2/message.hh000066400000000000000000000024401220407031500147520ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef MESSAGE_HH_INCLUDED__ #define MESSAGE_HH_INCLUDED__ #include #include #include #include #include "ex.hh" /// Some utilities for protobuffer messages namespace Message { DEF_EX( Ex, "Message exception", std::exception ) DEF_EX_STR( exCantParse, "Can't parse message", Ex ) DEF_EX_STR( exCantSerialize, "Can't serialize message", Ex ) using google::protobuf::io::ZeroCopyOutputStream; using google::protobuf::io::ZeroCopyInputStream; using google::protobuf::io::CodedInputStream; using google::protobuf::io::CodedOutputStream; using google::protobuf::MessageLite; /// Serializes the given message to the given zero-copy stream void serialize( MessageLite const &, ZeroCopyOutputStream & ); /// Serializes the given message to the given coded stream void serialize( MessageLite const &, CodedOutputStream & ); /// Reads and parses the given message from the given zero-copy stream void parse( MessageLite &, ZeroCopyInputStream & ); /// Reads and parses the given message from the given coded stream void parse( MessageLite &, CodedInputStream & ); } #endif zbackup-1.2/mt.cc000066400000000000000000000024731220407031500137420ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include "mt.hh" #include #include "check.hh" Mutex::Mutex() { pthread_mutex_init( &mutex, 0 ); } void Mutex::lock() { pthread_mutex_lock( &mutex ); } void Mutex::unlock() { pthread_mutex_unlock( &mutex ); } Mutex::~Mutex() { pthread_mutex_destroy( &mutex ); } Condition::Condition() { pthread_cond_init( &cond, 0 ); } void Condition::signal() { pthread_cond_signal( &cond ); } void Condition::broadcast() { pthread_cond_broadcast( &cond ); } void Condition::wait( Mutex & m ) { pthread_cond_wait( &cond, &m.mutex ); } Condition::~Condition() { pthread_cond_destroy( &cond ); } void * Thread::__thread_routine( void * param ) { return ( (Thread *)param ) -> threadFunction(); } void Thread::start() { CHECK( pthread_create( &thread, 0, &__thread_routine, this ) == 0, "pthread_create() failed" ); } void Thread::detach() { CHECK( pthread_detach( thread ) == 0, "pthread_detach() failed" ); } void * Thread::join() { void * ret; pthread_join( thread, &ret ); return ret; } size_t getNumberOfCpus() { long result = sysconf( _SC_NPROCESSORS_ONLN ); // Handle -1 and also sanitize the 0 value which wouldn't make sense return result < 1 ? 1 : result; } zbackup-1.2/mt.hh000066400000000000000000000024271220407031500137530ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef MT_HH_INCLUDED__ #define MT_HH_INCLUDED__ #include #include #include "nocopy.hh" /// Multithreading class Condition; class Mutex { friend class Condition; pthread_mutex_t mutex; public: Mutex(); /// Please consider using the Lock class instead void lock(); void unlock(); ~Mutex(); }; class Lock: NoCopy { Mutex * m; public: Lock( Mutex & mutex ): m( &mutex ) { m->lock(); } ~Lock() { m->unlock(); } }; /// Condition variable. 
Atomically unlocks the given mutex before it suspends /// waiting for event, and upon the awakening reacquires it class Condition { pthread_cond_t cond; public: Condition(); void signal(); void broadcast(); /// Mutex must be locked on entrance void wait( Mutex & m ); ~Condition(); }; class Thread { public: void start(); void detach(); void * join(); virtual ~Thread() {} protected: /// This is the function that is meant to work in a separate thread virtual void * threadFunction() throw()=0; private: pthread_t thread; static void * __thread_routine( void * ); }; /// Returns the number of CPUs this system has size_t getNumberOfCpus(); #endif zbackup-1.2/nocopy.hh000066400000000000000000000006311220407031500146350ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef NOCOPY_HH_INCLUDED__ #define NOCOPY_HH_INCLUDED__ /// A simple class to disallow copying of the class objects. Inherit from it to /// use it class NoCopy { public: NoCopy() {} private: NoCopy( NoCopy const & ); NoCopy & operator = ( NoCopy const & ); }; #endif // NOCOPY_HH zbackup-1.2/objectcache.cc000066400000000000000000000020351220407031500155460ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "objectcache.hh" ObjectCache::ObjectCache( unsigned maxSize_ ): maxObjects( maxSize_ ), totalObjects( 0 ) { } bool ObjectCache::remove( ObjectId const & id ) { Objects tmp; tmp.push_back( Object() ); tmp.back().id = id; ObjectMap::iterator i = objectMap.find( tmp.begin() ); if ( i == objectMap.end() ) return false; // Make sure that in case a destructor raises an exception, the cache // is left in a consistent state. 
Reference * ref = (*i)->reference; objects.erase( *i ); objectMap.erase( i ); --totalObjects; delete ref; return true; } void ObjectCache::clear() { for ( Objects::iterator i = objects.begin(); i != objects.end(); ) { // Make sure that in case a destructor raises an exception, the cache // is left in a consistent state. Reference * ref = i->reference; objectMap.erase( i ); objects.erase( i++ ); --totalObjects; delete ref; } } zbackup-1.2/objectcache.hh000066400000000000000000000061021220407031500155570ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef OBJECTCACHE_HH_INCLUDED__ #define OBJECTCACHE_HH_INCLUDED__ #include #include #include #include #include "sptr.hh" #include "nocopy.hh" /// ObjectCache allows caching dynamically-allocated objects of any type. The /// size of the cache is upper-bound and is specified at construction-time. /// Newly added or recently found objects are placed to the top of the internal /// stack. When there's no space in the cache, object become removed from the /// bottom of it class ObjectCache: NoCopy { public: ObjectCache( unsigned maxObjects ); /// Id of the object being stored in the cache typedef std::string ObjectId; /// Returns a reference to the stored object with the given id, or creates /// one if none existed. The caller must know the expected type of the object /// and specify it explicitly template< class T > sptr< T > & entry( ObjectId const & ); /// Removes a stored object with the given id. 
Returns true if the object /// was removed, false if it didn't exist in the cache bool remove( ObjectId const & ); /// Deletes all the objects from cache void clear(); ~ObjectCache() { clear(); } private: /// Base class for a reference to an object being stored struct Reference: NoCopy { virtual ~Reference() {} }; /// Having this class allows to delete T via virtual destructor accessible /// from the base Reference class template< class T > struct ReferenceTo: public Reference { sptr< T > ref; }; struct Object { ObjectId id; Reference * reference; }; typedef std::list< Object > Objects; struct ObjectsIteratorComp { bool operator () ( Objects::iterator const & x, Objects::iterator const & y ) { return x->id < y->id; } }; typedef std::set< Objects::iterator, ObjectsIteratorComp > ObjectMap; unsigned maxObjects; Objects objects; unsigned totalObjects; ObjectMap objectMap; }; template< class T > sptr< T > & ObjectCache::entry( ObjectId const & id ) { Objects tmp; tmp.push_back( Object() ); tmp.back().id = id; std::pair< ObjectMap::iterator, bool > r = objectMap.insert( tmp.begin() ); if ( r.second ) { // The object was created // Init the reference ReferenceTo< T > * refTo = new ReferenceTo< T >(); tmp.back().reference = refTo; // Add the object to top of our objects objects.splice( objects.begin(), tmp ); ++totalObjects; // evict an entry at the bottom, if needed if ( totalObjects > maxObjects ) { Objects::iterator i = --objects.end(); objectMap.erase( i ); Reference * ref = i->reference; objects.pop_back(); --totalObjects; delete ref; // We expect that it may throw } return refTo->ref; } else { // The object was existent // Move it to the top objects.splice( objects.begin(), objects, *r.first ); return dynamic_cast< ReferenceTo< T > & >( *objects.front().reference ).ref; } } #endif // OBJECTCACHE_HH zbackup-1.2/page_size.cc000066400000000000000000000004431220407031500152630ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include "page_size.hh" #include unsigned getPageSize() { static unsigned value = 0; if ( !value ) value = sysconf( _SC_PAGESIZE ); return value; } zbackup-1.2/page_size.hh000066400000000000000000000004051220407031500152730ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef PAGE_SIZE_HH_INCLUDED__ #define PAGE_SIZE_HH_INCLUDED__ /// Returns the page size used by this system unsigned getPageSize(); #endif zbackup-1.2/random.cc000066400000000000000000000007221220407031500145750ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "random.hh" #include namespace Random { void genarateTrue( void * buf, unsigned size ) { if ( RAND_bytes( (unsigned char *) buf, size ) != 1 ) throw exCantGenerate(); } void genaratePseudo( void * buf, unsigned size ) { if ( RAND_pseudo_bytes( (unsigned char *) buf, size ) < 0 ) throw exCantGenerate(); } } zbackup-1.2/random.hh000066400000000000000000000011221220407031500146020ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef RANDOM_HH_INCLUDED__ #define RANDOM_HH_INCLUDED__ #include #include "ex.hh" namespace Random { DEF_EX( exCantGenerate, "Error generating random sequence, try later", std::exception ) /// This one fills the buffer with true randomness, suitable for a key void genarateTrue( void * buf, unsigned size ); /// This one fills the buffer with pseudo randomness, suitable for salts but not /// keys void genaratePseudo( void * buf, unsigned size ); } #endif zbackup-1.2/rolling_hash.cc000066400000000000000000000011071220407031500157640ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include "rolling_hash.hh" RollingHash::RollingHash() { reset(); } void RollingHash::reset() { count = 0; factor = 0; nextFactor = 1; value = 0; } RollingHash::Digest RollingHash::digest( void const * buf, unsigned size ) { // TODO: this can be optimized, as in this case there's no need to calculate // factor values. RollingHash hash; for ( char const * p = ( char const * )buf; size--; ) hash.rollIn( *p++ ); return hash.digest(); } zbackup-1.2/rolling_hash.hh000066400000000000000000000043721220407031500160050ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef ROLLING_HASH_HH_INCLUDED__ #define ROLLING_HASH_HH_INCLUDED__ #include #include // Modified Rabin-Karp rolling hash with the base of 257 and the modulo of 2^64. // The canonical RK hash calculates the following value (e.g. for 4 bytes): // hash = ( v1*b^3 + v2*b^2 + v3*b + v4 ) % m // where v1, v2, v3 and v4 are the sequence of bytes, b is the base and m // is the modulo. // We add b^4 in the mix: // hash = ( b^4 + v1*b^3 + v2*b^2 + v3*b + v4 ) % m // This fixes collisions where sequences only differ in the amount of zero // bytes in the beginning (those amount to zero in the canonical RK), since the // power of b in the first member depends on the total number of bytes in the // sequence. // The choice of base: 257 is easy to multiply by (only two bits are set), and // is the first prime larger than the value of any byte. It's easy to create // collisions with the smaller primes: two-byte sequences '1, 0' and '0, base' // would collide, for example. // The choice of modulo: 32-bit is impractical due to birthday paradox -- you // get a collision with the 50% probability having only 77000 hashes. With // 64-bit, the number of hashes to have the same probability would be 5.1 // billion. 
With the block size of 64k, that would amount to 303 terabytes of // data stored, which should be enough for our purposes. // Note: ( a = ( a << 8 ) + a ) is equivalent to ( a *= 257 ) class RollingHash { uint64_t factor; uint64_t nextFactor; uint64_t value; size_t count; public: typedef uint64_t Digest; RollingHash(); void reset(); void rollIn( char c ) { factor = nextFactor; nextFactor = ( nextFactor << 8 ) + nextFactor; // nextFactor *= 257 value = ( value << 8 ) + value; value += ( unsigned char ) c; ++count; } void rotate( char in, char out ) { value -= uint64_t( ( unsigned char ) out ) * factor; value = ( value << 8 ) + value; // value *= 257 value += ( unsigned char ) in; } Digest digest() const { return value + nextFactor; } size_t size() const { return count; } static Digest digest( void const * buf, unsigned size ); }; #endif zbackup-1.2/sha256.cc000066400000000000000000000007431220407031500143300ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include "sha256.hh" Sha256::Sha256() { SHA256_Init( &ctx ); } void Sha256::add( void const * data, size_t size ) { SHA256_Update( &ctx, data, size ); } void Sha256::finish( void * result ) { SHA256_Final( ( unsigned char * ) result, &ctx ); } string Sha256::finish() { char buf[ Size ]; finish( buf ); return string( buf, buf + sizeof( buf ) ); } zbackup-1.2/sha256.hh000066400000000000000000000012151220407031500143350ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef SHA256_HH_INCLUDED__ #define SHA256_HH_INCLUDED__ #include #include #include using std::string; /// A simple wrapper over openssl class Sha256 { SHA256_CTX ctx; public: enum { // Number of bytes a digest has Size = SHA256_DIGEST_LENGTH }; Sha256(); /// Adds more data void add( void const * data, size_t size ); /// Result should point at at least Size bytes void finish( void * result ); /// Returns result as a string blob string finish(); }; #endif zbackup-1.2/sptr.hh000066400000000000000000000055761220407031500143330ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef SPTR_HH_INCLUDED__ #define SPTR_HH_INCLUDED__ /// A generic non-intrusive smart-pointer template. We could use boost::, tr1:: /// or whatever, but since there's no standard solution yet, it isn't worth /// the dependency given the simplicity of the template template< class T > class sptr_base { template< class TT > friend class sptr_base; T * p; unsigned * count; void increment() { if ( count ) ++*count; } public: sptr_base(): p( 0 ), count( 0 ) {} sptr_base( T * p_ ): p( p_ ), count( p ? new unsigned( 1 ) : 0 ) { } sptr_base( sptr_base< T > const & other ): p( other.p ), count( other.count ) { increment(); } // TT is meant to be a derivative of T template< class TT > sptr_base( sptr_base< TT > const & other ): p( ( T * ) other.p ), count( other.count ) { increment(); } void reset() { if ( count ) { if ( ! -- *count ) { delete count; count = 0; if ( p ) { T * p_ = p; p = 0; delete p_; } } else { p = 0; count = 0; } } } unsigned use_count() const { return count; } sptr_base & operator = ( sptr_base const & other ) { if ( &other != this ) { reset(); p = other.p; count = other.count; increment(); } return * this; } bool operator ! 
( void ) const { return !p; } bool operator == ( sptr_base const & other ) const { return p == other.p; } bool operator != ( sptr_base const & other ) const { return p != other.p; } ~sptr_base() { reset(); } protected: T * get_base( void ) const { return p; } }; template< class T > class sptr: public sptr_base< T > { public: sptr() {} sptr( T * p ): sptr_base< T >( p ) {} // TT is meant to be a derivative of T template< class TT > sptr( sptr< TT > const & other ): sptr_base< T >( other ) {} // Retrieval T * get( void ) const { return sptr_base< T > :: get_base(); } T * operator -> ( void ) const { return get(); } T & operator * ( void ) const { return * get(); } // Check operator bool( void ) const { return get(); } bool operator ! ( void ) const { return !get(); } }; template< class T > class const_sptr: public sptr_base< T > { public: const_sptr() {} const_sptr( T * p_ ): sptr_base< T >( p_ ) {} const_sptr( sptr< T > const & other ): sptr_base< T >( other ) {} // TT is meant to be a derivative of T template< class TT > const_sptr( sptr_base< TT > const & other ): sptr_base< T >( other ) {} // Retrieval T const * get( void ) const { return sptr_base< T > :: get_base(); } T const * operator -> ( void ) const { return get(); } T const & operator * ( void ) const { return * get(); } }; #endif zbackup-1.2/static_assert.hh000066400000000000000000000011601220407031500161740ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef STATIC_ASSERT_HH_INCLUDED__ #define STATIC_ASSERT_HH_INCLUDED__ // Based on the one from the Boost library. 
It wouldn't make sense to depend on // boost just for that namespace StaticAssert { template < bool > struct AssertionFailure; template <> struct AssertionFailure< true > {}; template< int > struct Test {}; } #define STATIC_ASSERT( B ) \ typedef ::StaticAssert::Test< \ sizeof( ::StaticAssert::AssertionFailure< bool( B ) > ) >\ static_assert_typedef_ ## __LINE__ #endif zbackup-1.2/storage_info_file.cc000066400000000000000000000017771220407031500170060ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include "encrypted_file.hh" #include "message.hh" #include "storage_info_file.hh" namespace StorageInfoFile { enum { FileFormatVersion = 1 }; void save( string const & fileName, StorageInfo const & storageInfo ) { EncryptedFile::OutputStream os( fileName.c_str(), EncryptionKey::noKey(), NULL ); FileHeader header; header.set_version( FileFormatVersion ); Message::serialize( header, os ); Message::serialize( storageInfo, os ); os.writeAdler32(); } void load( string const & fileName, StorageInfo & storageInfo ) { EncryptedFile::InputStream is( fileName.c_str(), EncryptionKey::noKey(), NULL ); FileHeader header; Message::parse( header, is ); if ( header.version() != FileFormatVersion ) throw exUnsupportedVersion(); Message::parse( storageInfo, is ); is.checkAdler32(); } } zbackup-1.2/storage_info_file.hh000066400000000000000000000013531220407031500170060ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
// Licensed under GNU GPLv2 or later

#ifndef STORAGE_INFO_FILE_HH_INCLUDED__
#define STORAGE_INFO_FILE_HH_INCLUDED__

// NOTE(review): the header names after these #include directives were missing;
// restored from the std::exception and std::string uses below -- confirm
#include <exception>
#include <string>

#include "encryption_key.hh"
#include "ex.hh"
#include "zbackup.pb.h"

namespace StorageInfoFile {

using std::string;

DEF_EX( Ex, "Storage info file exception", std::exception )
DEF_EX( exUnsupportedVersion, "Unsupported version of the storage info file format", Ex )

/// Saves the given StorageInfo data into the given file
void save( string const & fileName, StorageInfo const & );

/// Loads the given StorageInfo data from the given file
void load( string const & fileName, StorageInfo & );
}

#endif
zbackup-1.2/tartool/000077500000000000000000000000001220407031500144715ustar00rootroot00000000000000zbackup-1.2/tartool/CMakeLists.txt000066400000000000000000000004671220407031500172400ustar00rootroot00000000000000
# Copyright (c) 2012-2013 Konstantin Isakov
# Part of ZBackup. Licensed under GNU GPLv2 or later

cmake_minimum_required( VERSION 2.6.0 )
project( tartool )

# Default to an optimized build, but only when the user has not picked a build
# type (e.g. -DCMAKE_BUILD_TYPE=Debug) and the generator is single-config;
# multi-config generators choose the configuration at build time instead
if( NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES )
  set( CMAKE_BUILD_TYPE Release )
endif()

add_executable( tartool tartool.cc ../file.cc ../dir.cc )

install( TARGETS tartool DESTINATION bin )
zbackup-1.2/tartool/tartool.cc000066400000000000000000000117721220407031500164740ustar00rootroot00000000000000
// Copyright (c) 2012-2013 Konstantin Isakov
// Part of ZBackup.
Licensed under GNU GPLv2 or later #include #include #include #include #include #include "../dir.hh" #include "../file.hh" using std::string; using std::vector; using std::map; void mention( File & file, string const & path ) { file.write( path.data(), path.size() ); file.write( '\n' ); } bool startsWith( string const & s, char const * prefix ) { for ( char const * sPtr = s.c_str(), * pPtr = prefix; *pPtr; ++sPtr, ++pPtr ) if ( *sPtr != *pPtr ) return false; return true; } void scanDirIgnoringErrors( string const & path, File & includes, File & excludes, bool currentlyIncluded ); void scanDir( string const & path, File & includes, File & excludes, bool currentlyIncluded ) { Dir::Entry entry; vector< string > subdirs; vector< string > namedIncludes, namedExcludes; typedef map< string, bool > FileList; FileList fileList; bool doBackup = false; bool dontBackup = false; for ( Dir::Listing dir( path ); dir.getNext( entry ); ) { string const & fileName = entry.getFileName(); if ( entry.isDir() ) { if ( !entry.isSymLink() ) subdirs.push_back( fileName ); } else if ( fileName == ".backup" ) doBackup = true; if ( fileName == ".no-backup" ) dontBackup = true; else if ( startsWith( fileName, ".backup-" ) ) namedIncludes.push_back( fileName.substr( 8 ) ); else if ( startsWith( fileName, ".no-backup-" ) ) namedExcludes.push_back( fileName.substr( 11 ) ); } // If both are mentioned, backup if ( doBackup ) dontBackup = false; if ( doBackup && !currentlyIncluded ) { mention( includes, path ); currentlyIncluded = true; } if ( dontBackup && currentlyIncluded ) { mention( excludes, path ); currentlyIncluded = false; } // If we have any effective named lists, build the fileList map and process // them. 
if ( ( !currentlyIncluded && !namedIncludes.empty() ) || ( currentlyIncluded && !namedExcludes.empty() ) ) { for ( Dir::Listing dir( path ); dir.getNext( entry ); ) fileList[ entry.getFileName() ] = entry.isDir() && !entry.isSymLink(); if ( !currentlyIncluded ) { for ( vector< string > :: const_iterator i = namedIncludes.begin(); i != namedIncludes.end(); ++i ) { FileList::iterator entry = fileList.find( *i ); if ( entry != fileList.end() ) { mention( includes, Dir::addPath( path, *i ) ); if ( entry->second ) // Is it a dir? Scan it then. scanDir( Dir::addPath( path, entry->first ), includes, excludes, true ); // Make sure we don't process it twice. fileList.erase( entry ); } else fprintf( stderr, "Warning: named include %s does not exist in %s\n", i->c_str(), path.c_str() ); } } else { for ( vector< string > :: const_iterator i = namedExcludes.begin(); i != namedExcludes.end(); ++i ) { FileList::iterator entry = fileList.find( *i ); if ( entry != fileList.end() ) { mention( excludes, Dir::addPath( path, *i ) ); if ( entry->second ) // Is it a dir? Scan it then. scanDir( Dir::addPath( path, entry->first ), includes, excludes, false ); // Make sure we don't process it twice. 
fileList.erase( entry ); } else fprintf( stderr, "Warning: named exclude %s does not exist in %s\n", i->c_str(), path.c_str() ); } } // Scan the rest of dirs for ( FileList::const_iterator i = fileList.begin(); i != fileList.end(); ++i ) if ( i->second ) scanDirIgnoringErrors( Dir::addPath( path, i->first ), includes, excludes, currentlyIncluded ); } else { // No named lists -- just process all the dirs for ( size_t x = 0; x < subdirs.size(); ++x ) scanDirIgnoringErrors( Dir::addPath( path, subdirs[ x ] ), includes, excludes, currentlyIncluded ); } } void scanDirIgnoringErrors( string const & path, File & includes, File & excludes, bool currentlyIncluded ) { try { scanDir( path, includes, excludes, currentlyIncluded ); } catch( Dir::exCantList & e ) { fprintf( stderr, "Warning: %s\n", e.what() ); } } int main( int argc, char *argv[] ) { if ( argc != 4 ) { fprintf( stderr, "Usage: %s \n", *argv ); return EXIT_FAILURE; } try { File includes( argv[ 2 ], File::WriteOnly ); File excludes( argv[ 3 ], File::WriteOnly ); scanDir( argv[ 1 ], includes, excludes, false ); return EXIT_SUCCESS; } catch( std::exception & e ) { fprintf( stderr, "Error: %s\n", e.what() ); return EXIT_FAILURE; } } zbackup-1.2/tests/000077500000000000000000000000001220407031500141475ustar00rootroot00000000000000zbackup-1.2/tests/TODO.txt000066400000000000000000000000751220407031500154570ustar00rootroot00000000000000Convert those to cmake -- they still use qmake at the moment zbackup-1.2/tests/encrypted_file/000077500000000000000000000000001220407031500171435ustar00rootroot00000000000000zbackup-1.2/tests/encrypted_file/encrypted_file.pro000066400000000000000000000016451220407031500226670ustar00rootroot00000000000000###################################################################### # Automatically generated by qmake (2.01a) Sun Jul 14 20:54:52 2013 ###################################################################### TEMPLATE = app TARGET = DEPENDPATH += . INCLUDEPATH += . 
LIBS += -lcrypto -lprotobuf -lz DEFINES += __STDC_FORMAT_MACROS # Input SOURCES += test_encrypted_file.cc \ ../../unbuffered_file.cc \ ../../tmp_mgr.cc \ ../../page_size.cc \ ../../random.cc \ ../../encryption_key.cc \ ../../encryption.cc \ ../../encrypted_file.cc \ ../../file.cc \ ../../dir.cc \ ../../zbackup.pb.cc HEADERS += \ ../../unbuffered_file.hh \ ../../tmp_mgr.hh \ ../../adler32.hh \ ../../page_size.hh \ ../../random.hh \ ../../encryption_key.hh \ ../../encrypted_file.hh \ ../../encryption.hh \ ../../ex.hh \ ../../file.hh \ ../../dir.hh \ ../../zbackup.pb.h zbackup-1.2/tests/encrypted_file/test_encrypted_file.cc000066400000000000000000000111231220407031500235030ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #include #include "../../encrypted_file.hh" #include "../../encryption_key.hh" #include "../../random.hh" #include "../../tmp_mgr.hh" #include "../../check.hh" #include "../../adler32.hh" char rnd[ 16384 ]; Adler32::Value adler( int sz ) { Adler32 a; a.add( rnd, sz ); return a.result(); } void readAndWrite( EncryptionKey const & key, bool writeBackups, bool readBackups, bool readSkips ) { TmpMgr tmpMgr( "/dev/shm" ); sptr< TemporaryFile > tempFile = tmpMgr.makeTemporaryFile(); int fileSize = rand() % ( sizeof( rnd ) + 1 ); fprintf( stderr, "Run with %d bytes, %s%s%s%sfile %s...\n", fileSize, key.hasKey() ? "" : "no encryption, ", writeBackups ? "write backups, " : "", readBackups ? "read backups, " : "", readSkips ? 
"read skips, " : "", tempFile->getFileName().c_str() ); char iv[ Encryption::IvSize ]; Random::genaratePseudo( iv, sizeof( iv ) ); // Write { EncryptedFile::OutputStream out( tempFile->getFileName().c_str(), key, iv ); char const * next = rnd; int avail = 0; for ( int left = fileSize; left; ) { CHECK( out.ByteCount() == fileSize - left, "Incorrect bytecount in the " "middle of writing" ); void * data; CHECK( out.Next( &data, &avail ), "out.Next() returned false" ); CHECK( avail > 0, "out.Next() returned zero size" ); bool doBackup = writeBackups && ( rand() & 1 ); int backup; if ( doBackup ) { backup = rand() % ( avail + 1 ); // Make sure we don't back up and then need to back up again to finish // the write if ( avail > left ) backup = avail - left; avail -= backup; } int toWrite = avail > left ? left : avail; memcpy( data, next, toWrite ); if ( doBackup ) out.BackUp( backup ); next += toWrite; left -= toWrite; avail -= toWrite; if ( !avail && ( rand() & 1 ) ) { CHECK( adler( next - rnd ) == out.getAdler32(), "bad adler32 in the middle of writing" ); } } if ( avail || ( rand() & 1 ) ) out.BackUp( avail ); CHECK( out.ByteCount() == fileSize, "Incorrect bytecount after writing" ); if ( rand() & 1 ) { CHECK( adler( fileSize ) == out.getAdler32(), "bad adler32 of the written file" ); } } // Read back { EncryptedFile::InputStream in( tempFile->getFileName().c_str(), key, iv ); char const * next = rnd; void const * data; int avail = 0; for ( int left = fileSize; left; ) { if ( readSkips && ( rand() & 1 ) ) { int toSkip = rand() % ( left + 1 ); in.Skip( toSkip ); next += toSkip; left -= toSkip; avail = 0; continue; } CHECK( in.ByteCount() == fileSize - left, "Incorrect bytecount in the " "middle of reading" ); CHECK( in.Next( &data, &avail ), "file ended while %d were still left", left ); CHECK( avail > 0, "in.Next() returned zero size" ); bool doBackup = readBackups && ( rand() & 1 ); int backup; if ( doBackup ) { backup = rand() % ( avail + 1 ); avail -= backup; } int 
toRead = avail > left ? left : avail; CHECK( memcmp( next, data, toRead ) == 0, "Different bytes read than " "expected at offset %d", int( next - rnd ) ); if ( doBackup ) in.BackUp( backup ); next += toRead; left -= toRead; avail -= toRead; if ( !avail && ( rand() & 1 ) ) { CHECK( adler( next - rnd ) == in.getAdler32(), "bad adler32 in the middle of the reading" ); } } CHECK( in.ByteCount() == fileSize, "Incorrect bytecount after reading" ); CHECK( !avail, "at least %d bytes still available", avail ); CHECK( !in.Next( &data, &avail ), "file should have ended but resulted in " "%d more bytes", avail ); if ( rand() & 1 ) { CHECK( adler( fileSize ) == in.getAdler32(), "bad adler32 of the read file" ); } } } int main() { Random::genaratePseudo( rnd, sizeof( rnd ) ); EncryptionKeyInfo keyInfo; EncryptionKey::generate( "blah", keyInfo ); EncryptionKey key( "blah", &keyInfo ); EncryptionKey noKey( std::string(), NULL ); for ( size_t iteration = 100000; iteration--; ) readAndWrite( ( rand() & 1 ) ? key : noKey, rand() & 1, rand() & 1, rand() & 1 ); } zbackup-1.2/tests/rolling_hash/000077500000000000000000000000001220407031500166205ustar00rootroot00000000000000zbackup-1.2/tests/rolling_hash/rolling_hash.pro000066400000000000000000000006241220407031500220150ustar00rootroot00000000000000###################################################################### # Automatically generated by qmake (2.01a) ?? ???. 8 14:05:16 2012 ###################################################################### TEMPLATE = app TARGET = DEPENDPATH += . INCLUDEPATH += . LIBS += -lcrypto # Input SOURCES += test_rolling_hash.cc ../../rolling_hash.cc \ ../../random.cc HEADERS += \ ../../random.hh zbackup-1.2/tests/rolling_hash/test_rolling_hash.cc000066400000000000000000000064471220407031500226520ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include #include #include #include #include #include #include "../../rolling_hash.hh" #include "../../random.hh" using std::vector; using std::map; using std::set; using std::pair; using std::make_pair; int main() { // Generate a buffer with random data, then pick slices there and try // different strategies of rolling to them vector< char > data( 65536 ); Random::genaratePseudo( data.data(), data.size() ); for ( unsigned iteration = 0; iteration < 5000; ++iteration ) { unsigned sliceBegin = rand() % data.size(); unsigned sliceSize = 1 + ( rand() % ( data.size() - sliceBegin ) ); // Calculate the hash by roll-ins only uint64_t rollIns; { RollingHash hash; for ( unsigned x = 0; x < sliceSize; ++x ) hash.rollIn( data[ sliceBegin + x ] ); rollIns = hash.digest(); } // Calculate the hash by rolling-in from the beginning of data to sliceSize, // then rotating to sliceBegin uint64_t rotates; { RollingHash hash; for ( unsigned x = 0; x < sliceSize; ++x ) hash.rollIn( data[ x ] ); for ( unsigned x = 0; x < sliceBegin; ++x ) hash.rotate( data[ sliceSize + x ], data[ x ] ); rotates = hash.digest(); } if ( rollIns != rotates ) { fprintf( stderr, "Error in iteration %u: %016lx vs %016lx\n", iteration, rollIns, rotates ); return EXIT_FAILURE; } printf( "Iteration %u: %016lx\n", iteration, rollIns ); } fprintf( stderr, "Rolling hash test produced equal results\n" ); // Test collisions // Maps the hash to the ranges. 
Ideally each hash should be mapped to a // single range map< uint64_t, set< pair< unsigned, unsigned > > > collisions; size_t collisionValuesCount = 0; for ( unsigned iteration = 0; iteration < 500000; ++iteration ) { unsigned sliceBegin = rand() % ( data.size() - 7 ); // A minimum of 16 should be enough to ensure every unique slice corresponds // to a unique random sequence with a very high probability unsigned sliceSize = 16 + ( rand() % ( data.size() - sliceBegin ) ); // Calculate the hash by roll-ins (fastest) uint64_t rollIns; { RollingHash hash; for ( unsigned x = 0; x < sliceSize; ++x ) hash.rollIn( data[ sliceBegin + x ] ); rollIns = hash.digest(); } if ( collisions[ rollIns ].insert( make_pair( sliceBegin, sliceSize ) ).second ) ++collisionValuesCount; if ( ! ( ( iteration + 1 ) % 1000 ) ) printf( "Iteration %u: %016lx\n", iteration, rollIns ); } size_t collisionsFound = collisionValuesCount - collisions.size(); double collisionsPercentage = double( collisionsFound ) * 100 / collisionValuesCount; fprintf( stderr, "Collisions: %.04f%% (%zu in %zu)\n", collisionsPercentage, collisionsFound, collisionValuesCount ); if ( collisionsFound ) { // The probability of a collision in 500000 hashes is one to ~6 billions fprintf( stderr, "Found a collision, which should be highly unlikely\n" ); return EXIT_FAILURE; } fprintf( stderr, "Rolling hash test succeeded\n" ); return EXIT_SUCCESS; } zbackup-1.2/tmp_mgr.cc000066400000000000000000000023321220407031500147610ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include "tmp_mgr.hh" #include #include #include "dir.hh" #include "file.hh" TemporaryFile::TemporaryFile( string const & fileName ): fileName( fileName ) { } void TemporaryFile::moveOverTo( string const & destinationFileName, bool mayOverwrite ) { if ( !mayOverwrite && File::exists( destinationFileName ) ) throw TmpMgr::exWontOverwrite( destinationFileName ); File::rename( fileName, destinationFileName ); fileName.clear(); } TemporaryFile::~TemporaryFile() { if ( !fileName.empty() ) File::erase( fileName ); } string const & TemporaryFile::getFileName() const { return fileName; } TmpMgr::TmpMgr( string const & path ): path( path ) { if ( !Dir::exists( path ) ) Dir::create( path ); } sptr< TemporaryFile > TmpMgr::makeTemporaryFile() { string name( Dir::addPath( path, "XXXXXX") ); int fd = mkstemp( &name[ 0 ] ); if ( fd == -1 || close( fd ) != 0 ) throw exCantCreate( path ); return new TemporaryFile( name ); } TmpMgr::~TmpMgr() { try { Dir::remove( path ); } catch( Dir::exCantRemove & ) { } } zbackup-1.2/tmp_mgr.hh000066400000000000000000000034671220407031500150050ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef TMP_MGR_HH_INCLUDED__ #define TMP_MGR_HH_INCLUDED__ #include #include #include "dir.hh" #include "ex.hh" #include "file.hh" #include "nocopy.hh" #include "sptr.hh" /// A temporary file class TemporaryFile: NoCopy { public: /// Returns the temporary file's file name. The file may already be existent - /// it is supposed to be overwritten then string const & getFileName() const; /// Renames this temporary file over the given file name. 
If the destination /// file exists already, it gets replaced if mayOverwrite is true, or throws /// an exception otherwise void moveOverTo( string const & destinationFileName, bool mayOverwrite = false ); /// Removes the file from the disk, unless moveOverTo() was called previously ~TemporaryFile(); private: /// Use TmpMgr::makeTemporaryFile() instead of this constructor TemporaryFile( string const & fileName ); string fileName; friend class TmpMgr; }; /// Allows creating temporary files and later either removing them or moving /// them over to the target ones class TmpMgr: NoCopy { string path; public: DEF_EX( Ex, "Temporary file manager exception", std::exception ) DEF_EX_STR( exCantCreate, "Can't create a temporary file in dir", Ex ) DEF_EX_STR( exWontOverwrite, "Won't overwrite existing file", Ex ) /// Creates the given directory if it doesn't exist already and uses it to /// store temporary files. TmpMgr( string const & path ); /// Creates an new empty temporary file and returns its full file name, /// including the path. The file is then supposed to be overwritten sptr< TemporaryFile > makeTemporaryFile(); /// Removes the temporary directory, if possible ~TmpMgr(); }; #endif zbackup-1.2/unbuffered_file.cc000066400000000000000000000041461220407031500164450ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #define _LARGEFILE64_SOURCE #include #include #include #include #include #include "check.hh" #include "unbuffered_file.hh" #ifdef __APPLE__ #define lseek64 lseek #endif UnbufferedFile::UnbufferedFile( char const * fileName, Mode mode ) throw( exCantOpen ) { int flags = ( mode == WriteOnly ? 
( O_WRONLY | O_CREAT | O_TRUNC ) : O_RDONLY ); #ifndef __APPLE__ flags |= O_LARGEFILE; #endif fd = open( fileName, flags, 0666 ); if ( fd < 0 ) throw exCantOpen( fileName ); } size_t UnbufferedFile::read( void * buf, size_t size ) throw( exReadError ) { char * next = ( char * ) buf; size_t left = size; while( left ) { ssize_t rd = ::read( fd, next, left ); if ( rd < 0 ) { if ( errno != EINTR ) throw exReadError(); } else if ( rd > 0 ) { CHECK( ( size_t ) rd <= left, "read too many bytes from a file" ); next += rd; left -= rd; } else break; } return size - left; } void UnbufferedFile::write( void const * buf, size_t size ) throw( exWriteError ) { char const * next = ( char const * ) buf; size_t left = size; while( left ) { ssize_t written = ::write( fd, next, left ); if ( written < 0 ) { if ( errno != EINTR ) throw exWriteError(); } else { CHECK( ( size_t ) written <= left, "wrote too many bytes to a file" ); next += written; left -= written; } } } UnbufferedFile::Offset UnbufferedFile::size() throw( exSeekError ) { Offset cur = lseek64( fd, 0, SEEK_CUR ); if ( cur < 0 ) throw exSeekError(); Offset result = lseek64( fd, 0, SEEK_END ); if ( result < 0 || lseek64( fd, cur, SEEK_SET ) < 0 ) throw exSeekError(); return result; } void UnbufferedFile::seekCur( Offset offset ) throw( exSeekError ) { if ( lseek64( fd, offset, SEEK_CUR ) < 0 ) throw exSeekError(); } UnbufferedFile::~UnbufferedFile() throw() { close( fd ); } zbackup-1.2/unbuffered_file.hh000066400000000000000000000033431220407031500164550ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later #ifndef UNBUFFERED_FILE_HH_INCLUDED__ #define UNBUFFERED_FILE_HH_INCLUDED__ #include #include #include #include #include "ex.hh" #include "nocopy.hh" /// A file which does not employ its own buffering. /// TODO: add support for memory-mapped I/O, with the interface which would look /// like that of a zero-copy stream. 
However, since we can do encryption in- /// place, both interfaces should be available - when there's no memory-mapped /// I/O available, the user should still provide its own buffer (and then do /// in-place encryption in it). class UnbufferedFile: NoCopy { public: DEF_EX( Ex, "Unbuffered file exception", std::exception ) DEF_EX_STR( exCantOpen, "Can't open file", Ex ) DEF_EX( exReadError, "File read error", Ex ) DEF_EX( exWriteError, "File write error", Ex ) DEF_EX( exSeekError, "File seek error", Ex ) enum Mode { ReadOnly, WriteOnly }; typedef int64_t Offset; /// Opens the given file UnbufferedFile( char const * fileName, Mode ) throw( exCantOpen ); /// Reads up to 'size' bytes into the buffer. Returns the number of bytes /// read. If the value returned is less than the 'size' provided, the end of /// file was reached size_t read( void * buf, size_t size ) throw( exReadError ); /// Writes 'size' bytes void write( void const * buf, size_t size ) throw( exWriteError ); /// Returns file size Offset size() throw( exSeekError ); /// Seeks to the given offset, relative to the current file offset void seekCur( Offset ) throw( exSeekError ); ~UnbufferedFile() throw(); private: int fd; }; #endif zbackup-1.2/zbackup.cc000066400000000000000000000311661220407031500147620ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #include #include #include #include #include #include #include #include #include "backup_creator.hh" #include "backup_file.hh" #include "backup_restorer.hh" #include "debug.hh" #include "dir.hh" #include "encryption_key.hh" #include "ex.hh" #include "file.hh" #include "mt.hh" #include "sha256.hh" #include "sptr.hh" #include "storage_info_file.hh" #include "zbackup.hh" using std::vector; Paths::Paths( string const & storageDir ): storageDir( storageDir ) { } string Paths::getTmpPath() { return string( Dir::addPath( storageDir, "tmp" ) ); } string Paths::getBundlesPath() { return string( Dir::addPath( storageDir, "bundles" ) ); } string Paths::getStorageInfoPath() { return string( Dir::addPath( storageDir, "info" ) ); } string Paths::getIndexPath() { return string( Dir::addPath( storageDir, "index" ) ); } string Paths::getBackupsPath() { return string( Dir::addPath( storageDir, "backups" ) ); } ZBackupBase::ZBackupBase( string const & storageDir, string const & password ): Paths( storageDir ), storageInfo( loadStorageInfo() ), encryptionkey( password, storageInfo.has_encryption_key() ? 
&storageInfo.encryption_key() : 0 ), tmpMgr( getTmpPath() ), chunkIndex( encryptionkey, tmpMgr, getIndexPath() ) { } StorageInfo ZBackupBase::loadStorageInfo() { StorageInfo storageInfo; StorageInfoFile::load( getStorageInfoPath(), storageInfo ); return storageInfo; } void ZBackupBase::initStorage( string const & storageDir, string const & password, bool isEncrypted ) { StorageInfo storageInfo; // TODO: make the following configurable storageInfo.set_chunk_max_size( 65536 ); storageInfo.set_bundle_max_payload_size( 0x200000 ); if ( isEncrypted ) EncryptionKey::generate( password, *storageInfo.mutable_encryption_key() ); Paths paths( storageDir ); if ( !Dir::exists( storageDir ) ) Dir::create( storageDir ); if ( !Dir::exists( paths.getBundlesPath() ) ) Dir::create( paths.getBundlesPath() ); if ( !Dir::exists( paths.getBackupsPath() ) ) Dir::create( paths.getBackupsPath() ); if ( !Dir::exists( paths.getIndexPath() ) ) Dir::create( paths.getIndexPath() ); string storageInfoPath( paths.getStorageInfoPath() ); if ( File::exists( storageInfoPath ) ) throw exWontOverwrite( storageInfoPath ); StorageInfoFile::save( storageInfoPath, storageInfo ); } string ZBackupBase::deriveStorageDirFromBackupsFile( string const & backupsFile ) { // TODO: handle cases when there's a backup/ folder within the backup/ folder // correctly string realPath = Dir::getRealPath( Dir::getDirName( backupsFile ) ); size_t pos; if ( realPath.size() >= 8 && strcmp( realPath.c_str() + realPath.size() - 8, "/backups") == 0 ) pos = realPath.size() - 8; else pos = realPath.rfind( "/backups/" ); if ( pos == string::npos ) throw exCantDeriveStorageDir( backupsFile ); else return realPath.substr( 0, pos ); } ZBackup::ZBackup( string const & storageDir, string const & password, size_t threads ): ZBackupBase( storageDir, password ), chunkStorageWriter( storageInfo, encryptionkey, tmpMgr, chunkIndex, getBundlesPath(), getIndexPath(), threads ) { } void ZBackup::backupFromStdin( string const & outputFileName ) { 
if ( isatty( fileno( stdin ) ) ) throw exWontReadFromTerminal(); if ( File::exists( outputFileName ) ) throw exWontOverwrite( outputFileName ); Sha256 sha256; BackupCreator backupCreator( storageInfo, chunkIndex, chunkStorageWriter ); time_t startTime = time( 0 ); uint64_t totalDataSize = 0; for ( ; ; ) { size_t toRead = backupCreator.getInputBufferSize(); // dPrintf( "Reading up to %u bytes from stdin\n", toRead ); void * inputBuffer = backupCreator.getInputBuffer(); size_t rd = fread( inputBuffer, 1, toRead, stdin ); if ( !rd ) { if ( feof( stdin ) ) { dPrintf( "No more input on stdin\n" ); break; } else throw exStdinError(); } sha256.add( inputBuffer, rd ); backupCreator.handleMoreData( rd ); totalDataSize += rd; } // Finish up with the creator backupCreator.finish(); string serialized; backupCreator.getBackupData( serialized ); BackupInfo info; info.set_sha256( sha256.finish() ); info.set_size( totalDataSize ); // Shrink the serialized data iteratively until it wouldn't shrink anymore for ( ; ; ) { BackupCreator backupCreator( storageInfo, chunkIndex, chunkStorageWriter ); char const * ptr = serialized.data(); size_t left = serialized.size(); while( left ) { size_t bufferSize = backupCreator.getInputBufferSize(); size_t toCopy = bufferSize > left ? 
left : bufferSize; memcpy( backupCreator.getInputBuffer(), ptr, toCopy ); backupCreator.handleMoreData( toCopy ); ptr += toCopy; left -= toCopy; } backupCreator.finish(); string newGen; backupCreator.getBackupData( newGen ); if ( newGen.size() < serialized.size() ) { serialized.swap( newGen ); info.set_iterations( info.iterations() + 1 ); } else break; } dPrintf( "Iterations: %u\n", info.iterations() ); info.mutable_backup_data()->swap( serialized ); info.set_time( time( 0 ) - startTime ); // Commit the bundles to the disk before creating the final output file chunkStorageWriter.commit(); // Now save the resulting BackupInfo sptr< TemporaryFile > tmpFile = tmpMgr.makeTemporaryFile(); BackupFile::save( tmpFile->getFileName(), encryptionkey, info ); tmpFile->moveOverTo( outputFileName ); } ZRestore::ZRestore( string const & storageDir, string const & password, size_t cacheSize ): ZBackupBase( storageDir, password ), chunkStorageReader( storageInfo, encryptionkey, chunkIndex, getBundlesPath(), cacheSize ) { } void ZRestore::restoreToStdin( string const & inputFileName ) { if ( isatty( fileno( stdout ) ) ) throw exWontWriteToTerminal(); BackupInfo backupInfo; BackupFile::load( inputFileName, encryptionkey, backupInfo ); string backupData; // Perform the iterations needed to get to the actual user backup data for ( ; ; ) { backupData.swap( *backupInfo.mutable_backup_data() ); if ( backupInfo.iterations() ) { struct StringWriter: public DataSink { string result; virtual void saveData( void const * data, size_t size ) { result.append( ( char const * ) data, size ); } } stringWriter; BackupRestorer::restore( chunkStorageReader, backupData, stringWriter ); backupInfo.mutable_backup_data()->swap( stringWriter.result ); backupInfo.set_iterations( backupInfo.iterations() - 1 ); } else break; } struct StdoutWriter: public DataSink { Sha256 sha256; virtual void saveData( void const * data, size_t size ) { sha256.add( data, size ); if ( fwrite( data, size, 1, stdout ) != 1 ) 
throw exStdoutError(); } } stdoutWriter; BackupRestorer::restore( chunkStorageReader, backupData, stdoutWriter ); if ( stdoutWriter.sha256.finish() != backupInfo.sha256() ) throw exChecksumError(); } DEF_EX( exNonEncryptedWithKey, "--non-encrypted and --password-file are incompatible", std::exception ) DEF_EX( exSpecifyEncryptionOptions, "Specify either --password-file or --non-encrypted", std::exception ) DEF_EX_STR( exInvalidThreadsValue, "Invalid threads value specified:", std::exception ) int main( int argc, char *argv[] ) { try { char const * passwordFile = 0; bool nonEncrypted = false; size_t const defaultThreads = getNumberOfCpus(); size_t threads = defaultThreads; size_t const defaultCacheSizeMb = 40; size_t cacheSizeMb = defaultCacheSizeMb; vector< char const * > args; for( int x = 1; x < argc; ++x ) { if ( strcmp( argv[ x ], "--password-file" ) == 0 && x + 1 < argc ) { passwordFile = argv[ x + 1 ]; ++x; } else if ( strcmp( argv[ x ], "--non-encrypted" ) == 0 ) nonEncrypted = true; else if ( strcmp( argv[ x ], "--silent" ) == 0 ) verboseMode = false; else if ( strcmp( argv[ x ], "--threads" ) == 0 && x + 1 < argc ) { int n; if ( sscanf( argv[ x + 1 ], "%zu %n", &threads, &n ) != 1 || argv[ x + 1 ][ n ] || threads < 1 ) throw exInvalidThreadsValue( argv[ x + 1 ] ); ++x; } else if ( strcmp( argv[ x ], "--cache-size" ) == 0 && x + 1 < argc ) { char suffix[ 16 ]; int n; if ( sscanf( argv[ x + 1 ], "%zu %15s %n", &cacheSizeMb, suffix, &n ) == 2 && !argv[ x + 1 ][ n ] ) { // Check the suffix for ( char * c = suffix; *c; ++c ) *c = tolower( *c ); if ( strcmp( suffix, "mb" ) != 0 ) { fprintf( stderr, "Invalid suffix specified in cache size: %s. " "The only supported suffix is 'mb' for megabytes\n", argv[ x + 1 ] ); return EXIT_FAILURE; } ++x; } else { fprintf( stderr, "Invalid cache size value specified: %s. " "Must be a number with the 'mb' suffix, e.g. 
'100mb'\n", argv[ x + 1 ] ); return EXIT_FAILURE; } } else args.push_back( argv[ x ] ); } if ( nonEncrypted && passwordFile ) throw exNonEncryptedWithKey(); if ( args.size() < 1 ) { fprintf( stderr, "ZBackup, a versatile deduplicating backup tool, version 1.2\n" "Copyright (c) 2012-2013 Konstantin Isakov \n" "Comes with no warranty. Licensed under GNU GPLv2 or later.\n" "Visit the project's home page at http://zbackup.org/\n\n" "Usage: %s [flags] [command args]\n" " Flags: --non-encrypted|--password-file \n" " --silent (default is verbose)\n" " --threads (default is %zu on your system)\n" " --cache-size MB (default is %zu)\n" " Commands:\n" " init - initializes new storage;\n" " backup - performs a backup from stdin;\n" " restore - restores a backup to stdout.\n", *argv, defaultThreads, defaultCacheSizeMb ); return EXIT_FAILURE; } // Read the password string passwordData; if ( passwordFile ) { File f( passwordFile, File::ReadOnly ); passwordData.resize( f.size() ); f.read( &passwordData[ 0 ], passwordData.size() ); // If the password ends with \n, remove that last \n. 
Many editors will // add \n there even if a user doesn't want them to if ( !passwordData.empty() && passwordData[ passwordData.size() - 1 ] == '\n' ) passwordData.resize( passwordData.size() - 1 ); } if ( strcmp( args[ 0 ], "init" ) == 0 ) { // Perform the init if ( args.size() != 2 ) { fprintf( stderr, "Usage: %s init \n", *argv ); return EXIT_FAILURE; } if ( !nonEncrypted && !passwordFile ) throw exSpecifyEncryptionOptions(); ZBackup::initStorage( args[ 1 ], passwordData, !nonEncrypted ); } else if ( strcmp( args[ 0 ], "backup" ) == 0 ) { // Perform the backup if ( args.size() != 2 ) { fprintf( stderr, "Usage: %s backup \n", *argv ); return EXIT_FAILURE; } ZBackup zb( ZBackup::deriveStorageDirFromBackupsFile( args[ 1 ] ), passwordData, threads ); zb.backupFromStdin( args[ 1 ] ); } else if ( strcmp( args[ 0 ], "restore" ) == 0 ) { // Perform the restore if ( args.size() != 2 ) { fprintf( stderr, "Usage: %s restore \n", *argv ); return EXIT_FAILURE; } ZRestore zr( ZRestore::deriveStorageDirFromBackupsFile( args[ 1 ] ), passwordData, cacheSizeMb * 1048576 ); zr.restoreToStdin( args[ 1 ] ); } else { fprintf( stderr, "Error: unknown command line option: %s\n", args[ 0 ] ); return EXIT_FAILURE; } } catch( std::exception & e ) { fprintf( stderr, "%s\n", e.what() ); return EXIT_FAILURE; } return EXIT_SUCCESS; } zbackup-1.2/zbackup.hh000066400000000000000000000050651220407031500147730ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. 
Licensed under GNU GPLv2 or later #ifndef ZBACKUP_HH_INCLUDED__ #define ZBACKUP_HH_INCLUDED__ #include #include #include #include #include "chunk_id.hh" #include "chunk_index.hh" #include "chunk_storage.hh" #include "encryption_key.hh" #include "ex.hh" #include "tmp_mgr.hh" #include "zbackup.pb.h" using std::string; using std::vector; struct Paths { string storageDir; Paths( string const & storageDir ); string getTmpPath(); string getRestorePath(); string getCreatePath(); string getBundlesPath(); string getStorageInfoPath(); string getIndexPath(); string getBackupsPath(); }; class ZBackupBase: protected Paths { public: DEF_EX( Ex, "ZBackup exception", std::exception ) DEF_EX_STR( exWontOverwrite, "Won't overwrite existing file", Ex ) DEF_EX( exStdinError, "Error reading from standard input", Ex ) DEF_EX( exWontReadFromTerminal, "Won't read data from a terminal", exStdinError ) DEF_EX( exStdoutError, "Error writing to standard output", Ex ) DEF_EX( exWontWriteToTerminal, "Won't write data to a terminal", exStdoutError ) DEF_EX( exSerializeError, "Failed to serialize data", Ex ) DEF_EX( exParseError, "Failed to parse data", Ex ) DEF_EX( exChecksumError, "Checksum error", Ex ) DEF_EX_STR( exCantDeriveStorageDir, "The path must be within the backups/ dir:", Ex ) /// Opens the storage ZBackupBase( string const & storageDir, string const & password ); /// Creates new storage static void initStorage( string const & storageDir, string const & password, bool isEncrypted ); /// For a given file within the backups/ dir in the storage, returns its /// storage dir or throws an exception static string deriveStorageDirFromBackupsFile( string const & backupsFile ); protected: StorageInfo storageInfo; EncryptionKey encryptionkey; TmpMgr tmpMgr; ChunkIndex chunkIndex; private: StorageInfo loadStorageInfo(); }; class ZBackup: public ZBackupBase { ChunkStorage::Writer chunkStorageWriter; public: ZBackup( string const & storageDir, string const & password, size_t threads ); /// Backs 
up the data from stdin void backupFromStdin( string const & outputFileName ); }; class ZRestore: public ZBackupBase { ChunkStorage::Reader chunkStorageReader; public: ZRestore( string const & storageDir, string const & password, size_t cacheSize ); /// Restores the data to stdin void restoreToStdin( string const & inputFileName ); }; #endif zbackup-1.2/zbackup.proto000066400000000000000000000071711220407031500155370ustar00rootroot00000000000000// Copyright (c) 2012-2013 Konstantin Isakov // Part of ZBackup. Licensed under GNU GPLv2 or later // Protobuffers used in zbackup // This stores the key used for the encryption of all the blocks. The key itself // is stored in the encrypted form. A user supplies a password - it is used // together with salt and rounds to generate a decryption key for the actual // key used for block encryption. This way we can change the password without // re-encrypting all the blocks message EncryptionKeyInfo { // The decryption key is derived from the password, salt and rounds using // PKCS5_PBKDF2_HMAC_SHA1 // Salt to use together with the user password required bytes salt = 1; // Rounds of hashing to apply when generating the key used to decrypt the // block key required uint32 rounds = 2; // Stores the block encryption key, in an encrypted form itself required bytes encrypted_key = 3; // Used to check that the key was decrypted correctly - see the next field required bytes key_check_input = 4; // HMAC of key_check_input using the decrypted key. Used to check that the // key was indeed decrypted correctly required bytes key_check_hmac = 5; } message StorageInfo { // Maximum chunk size used when storing chunks required uint32 chunk_max_size = 1; // Maximum number of bytes a bundle can hold. Only real chunk bytes are // counted, not metadata. 
Any bundle should be able to contain at least // one arbitrary single chunk, so this should not be smaller than // chunk_max_size required uint32 bundle_max_payload_size = 2; // If present, used for encryption/decryption of all data optional EncryptionKeyInfo encryption_key = 3; } message BundleInfo { // Info about a single chunk stored message ChunkRecord { // Id of the chunk required bytes id = 1; // Size of the chunk required uint32 size = 2; } // A sequence of chunk records repeated ChunkRecord chunk_record = 1; } message FileHeader { // File format version required uint32 version = 1; } message IndexBundleHeader { // Id of the bundle following in the stream. If not present, indicates the // end of log file optional bytes id = 1; } // A single instruction. Backups are made of a sequence of those instructions, // which are executed one after another message BackupInstruction { // Both fields can present simultaneously. They are evaluated in the same // order they are listed here // If present, the chunk with that id should be emitted to the data flow optional bytes chunk_to_emit = 1; // If present, the bytes contained in the field should be emitted to the // data flow optional bytes bytes_to_emit = 2; } message BackupInfo { // The backup data. Since usually the field is quite large for real life /// backups, we process its serialized data with the same backup algorithm // iteratively until it doesn't shrink. The content of this field represents // the last iteration of that process. If iterations = 0, it directly // represents the user's backup data. If iterations = 1, it represents the // backed up BackupData which would represent the user's backed up data once // it is restored, and so on. 
// The type is 'bytes' as the result is serialized required bytes backup_data = 1; // Number of times backup_data should be restored with the 'restore' algorithm // before we get what we need to restore for the end user optional uint32 iterations = 2 [default = 0]; // Number of bytes in the backup data required uint64 size = 3; // SHA-256 of the original data required bytes sha256 = 4; // Time spent creating the backup, in seconds optional int64 time = 5; }