From 93d778bdcb108cfe29ff576946e9703f05596953 Mon Sep 17 00:00:00 2001
From: Hatter Jiang <jht5945@gmail.com>
Date: Fri, 20 Oct 2023 00:05:22 +0800
Subject: [PATCH] feat: init commit, copied from crate chacha20-poly-aead

---
 .gitignore                            |   1 +
 Cargo.toml                            |  20 +
 LICENSE-APACHE                        | 202 ++++++++++
 LICENSE-MIT                           |  19 +
 README-from-chacha20-poly1305-aead.md |  75 ++++
 src/aead.rs                           | 476 +++++++++++++++++++++++
 src/as_bytes.rs                       |  43 +++
 src/chacha20.rs                       | 250 ++++++++++++
 src/lib.rs                            |  60 +++
 src/poly1305.rs                       | 536 ++++++++++++++++++++++++++
 src/simd.rs                           | 110 ++++++
 src/simd_opt/mod.rs                   |  44 +++
 src/simd_opt/u32x4.rs                 |  71 ++++
 src/simdint.rs                        |  20 +
 src/simdop.rs                         |  93 +++++
 src/simdty.rs                         |  59 +++
 16 files changed, 2079 insertions(+)
 create mode 100644 Cargo.toml
 create mode 100644 LICENSE-APACHE
 create mode 100644 LICENSE-MIT
 create mode 100644 README-from-chacha20-poly1305-aead.md
 create mode 100644 src/aead.rs
 create mode 100644 src/as_bytes.rs
 create mode 100644 src/chacha20.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/poly1305.rs
 create mode 100644 src/simd.rs
 create mode 100644 src/simd_opt/mod.rs
 create mode 100644 src/simd_opt/u32x4.rs
 create mode 100644 src/simdint.rs
 create mode 100644 src/simdop.rs
 create mode 100644 src/simdty.rs

diff --git a/.gitignore b/.gitignore
index 3bf25c0..409abaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.idea/
 # ---> Rust
 # Generated by Cargo
 # will have compiled files and executables
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..5aad182
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "chacha20-poly1305-stream"
+version = "0.1.0"
+edition = "2021"
+authors = ["Cesar Eduardo Barros <cesarb@cesarb.eti.br>", "Hatter Jiang <jht5945@gmail.com>"]
+description = "A pure Rust implementation of the ChaCha20-Poly1305 AEAD from RFC 7539."
+repository = "https://git.hatter.ink/hatter/chacha20-poly1305-stream"
+readme = "README.md"
+keywords = ["chacha20", "poly1305", "aead", "crypto"]
+license = "MIT OR Apache-2.0"
+
+[features]
+bench = []
+simd = []
+simd_opt = ["simd"]
+simd_asm = ["simd_opt"]
+
+[dependencies]
+constant_time_eq = "0.1.0"
+clippy = { version = "0.0.37", optional = true }
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..8f71f43
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..7948117
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,19 @@
+Copyright (c) 2015 The blake2-rfc Developers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README-from-chacha20-poly1305-aead.md b/README-from-chacha20-poly1305-aead.md
new file mode 100644
index 0000000..aa51760
--- /dev/null
+++ b/README-from-chacha20-poly1305-aead.md
@@ -0,0 +1,75 @@
+This is a pure Rust implementation of the ChaCha20-Poly1305 AEAD from
+[RFC 7539].
+
+[RFC 7539]: https://tools.ietf.org/html/rfc7539
+
+## Design
+
+There are two main designs for an encryption/decryption API: either
+having one state/context struct with a method which is called repeatedly
+to encrypt/decrypt the next fragment of data, or having a single
+standalone function which is called once and does all the work in a
+single call.
+
+For authenticated encryption, it's important that on decryption no
+output is produced until the authentication tag is verified. That
+requires two passes over the data for decryption: the first pass
+verifies the tag, and the second pass does the output. It would be
+needlessly complex to implement this with a state/context struct, so
+this crate uses a single function call to do the whole decryption. For
+symmetry, the same design is used for the encryption function.
+
+The base primitives (ChaCha20 and Poly1305) are not exposed separately,
+since they are harder to use securely. This also allows their
+implementation to be tuned to the combined use case; for instance, the
+base primitives need no buffering.
+
+## Limitations
+
+The amount of data that can be encrypted in a single call is 2^32 - 1
+blocks of 64 bytes, slightly less than 256 GiB. This limit could be
+increased to 2^64 bytes, if necessary, by allowing the use of a shorter
+nonce.
+
+This crate does not attempt to clear potentially sensitive data from its
+work memory (which includes the stack and processor registers). To
+do so correctly without a heavy performance penalty would require help
+from the compiler. It's better to not attempt to do so than to present a
+false assurance.
+
+## SIMD optimization
+
+This crate has experimental support for explicit SIMD optimizations. It
+requires nightly Rust due to the use of unstable features.
+
+The following cargo features enable the explicit SIMD optimization:
+
+* `simd` enables the explicit use of SIMD vectors instead of a plain
+  struct
+* `simd_opt` additionally enables the use of SIMD shuffles to implement
+  some of the rotates
+
+While one might expect that each of these is faster than the previous
+one, and that they are all faster than not enabling explicit SIMD
+vectors, that's not always the case. It can vary depending on target
+architecture and compiler options. If you need the extra speed from
+these optimizations, benchmark each one (the `bench` feature enables
+`cargo bench` in this crate, so you can use for instance `cargo bench
+--features="bench simd_opt"`). They have currently been tuned for SSE2
+(x86 and x86-64) and NEON (arm).
+
+## License
+
+Licensed under either of
+
+ * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+
+at your option.
+
+### Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally
+submitted for inclusion in the work by you, as defined in the Apache-2.0
+license, shall be dual licensed as above, without any additional terms or
+conditions.
diff --git a/src/aead.rs b/src/aead.rs
new file mode 100644
index 0000000..d9a378e
--- /dev/null
+++ b/src/aead.rs
@@ -0,0 +1,476 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+use std::error::Error;
+use std::fmt::{self, Display, Formatter};
+use std::io::{self, ErrorKind, Read, Write};
+
+use crate::as_bytes::AsBytes;
+use crate::chacha20::ChaCha20;
+use constant_time_eq::constant_time_eq;
+use crate::poly1305::Poly1305;
+use crate::simd::u32x4;
+
+const CHACHA20_COUNTER_OVERFLOW: u64 = ((1 << 32) - 1) * 64;
+
+/// Encrypts a byte slice and returns the authentication tag.
+///
+/// The ciphertext is written to `output`. The call is forwarded to
+/// `encrypt_read`, so that function's errors are returned unchanged.
+///
+/// # Example
+///
+/// ```
+/// use chacha20_poly1305_stream::encrypt;
+///
+/// let key = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+///            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
+/// let nonce = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+/// let aad = [1, 2, 3, 4];
+/// let plaintext = b"hello, world";
+/// // Vec implements the Write trait
+/// let mut ciphertext = Vec::with_capacity(plaintext.len());
+/// let tag = encrypt(&key, &nonce, &aad, plaintext, &mut ciphertext).unwrap();
+///
+/// assert_eq!(ciphertext, [0xfc, 0x5a, 0x17, 0x82,
+///     0xab, 0xcf, 0xbc, 0x5d, 0x18, 0x29, 0xbf, 0x97]);
+/// assert_eq!(tag, [0xdb, 0xb7, 0x0d, 0xda, 0xbd, 0xfa, 0x8c, 0xa5,
+///                  0x60, 0xa2, 0x30, 0x3d, 0xe6, 0x07, 0x92, 0x10]);
+/// ```
+pub fn encrypt<W: Write>(key: &[u8], nonce: &[u8],
+                         aad: &[u8], mut input: &[u8],
+                         output: &mut W) -> io::Result<[u8; 16]> {
+    encrypt_read(key, nonce, aad, &mut input, output)
+}
+
+/// Encrypts bytes from a reader and returns the authentication tag.
+///
+/// Same as `encrypt`, but the plaintext is pulled from `input` chunk by
+/// chunk. Read/write errors are returned, as is a `WriteZero` "counter
+/// overflow" error once the total input reaches `CHACHA20_COUNTER_OVERFLOW`.
+pub fn encrypt_read<R: Read, W: Write>(key: &[u8], nonce: &[u8],
+                                       aad: &[u8], input: &mut R,
+                                       output: &mut W) -> io::Result<[u8; 16]> {
+    let mut chacha20 = ChaCha20::new(key, nonce);
+    let mut poly1305 = Poly1305::new(&chacha20.next().as_bytes()[..32]);
+    // The first 32 bytes of the first keystream block key the Poly1305 authenticator.
+    let aad_len = aad.len() as u64;
+    let mut input_len = 0;
+
+    poly1305.padded_blocks(aad);
+
+    let mut buf = [u32x4::default(); 4];
+    loop {
+        let read = read_all(input, buf.as_mut_bytes())?;
+        if read == 0 { break; }
+
+        input_len += read as u64;
+        if input_len >= CHACHA20_COUNTER_OVERFLOW {
+            return Err(io::Error::new(ErrorKind::WriteZero,
+                                      "counter overflow"));
+        }
+        // XOR the chunk with the next keystream block to encrypt it in place.
+        let block = chacha20.next();
+        buf[0] = buf[0] ^ block[0];
+        buf[1] = buf[1] ^ block[1];
+        buf[2] = buf[2] ^ block[2];
+        buf[3] = buf[3] ^ block[3];
+        // The MAC covers the ciphertext, so authenticate after encrypting, then emit it.
+        poly1305.padded_blocks(&buf.as_bytes()[..read]);
+        output.write_all(&buf.as_bytes()[..read])?;
+    }
+    // Close the MAC input with the AAD and ciphertext lengths as little-endian u64s.
+    poly1305.block([aad_len.to_le(), input_len.to_le()].as_bytes());
+
+    let mut tag = [0; 16];
+    tag.clone_from_slice(poly1305.tag().as_bytes());
+    Ok(tag)
+}
+
+/// Verifies the authentication tag and decrypts a byte slice.
+///
+/// On a tag mismatch nothing is written to `output` and
+/// `Err(DecryptError::TagMismatch)` is returned. Panics if `tag.len() != 16`.
+///
+/// # Example
+///
+/// ```
+/// # use chacha20_poly1305_stream::DecryptError;
+/// # fn example() -> Result<(), DecryptError> {
+/// use chacha20_poly1305_stream::decrypt;
+///
+/// let key = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+///            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
+/// let nonce = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+/// let aad = [1, 2, 3, 4];
+///
+/// let ciphertext = [0xfc, 0x5a, 0x17, 0x82, 0xab, 0xcf, 0xbc, 0x5d,
+///                   0x18, 0x29, 0xbf, 0x97];
+/// let tag = [0xdb, 0xb7, 0x0d, 0xda, 0xbd, 0xfa, 0x8c, 0xa5,
+///            0x60, 0xa2, 0x30, 0x3d, 0xe6, 0x07, 0x92, 0x10];
+///
+/// // Vec implements the Write trait
+/// let mut plaintext = Vec::with_capacity(ciphertext.len());
+///
+/// decrypt(&key, &nonce, &aad, &ciphertext, &tag, &mut plaintext)?;
+///
+/// assert_eq!(plaintext, b"hello, world");
+/// # Ok(())
+/// # }
+/// # example().unwrap();
+/// ```
+pub fn decrypt<W: Write>(key: &[u8], nonce: &[u8],
+                         aad: &[u8], mut input: &[u8], tag: &[u8],
+                         output: &mut W) -> Result<(), DecryptError> {
+    let mut chacha20 = ChaCha20::new(key, nonce);
+    let mut poly1305 = Poly1305::new(&chacha20.next().as_bytes()[..32]);
+
+    let aad_len = aad.len() as u64;
+    let input_len = input.len() as u64;
+    assert!(tag.len() == 16);
+
+    if input_len >= CHACHA20_COUNTER_OVERFLOW {
+        return Err(io::Error::new(ErrorKind::WriteZero,
+                                  "counter overflow").into());
+    }
+    // First pass: recompute the tag over the AAD, the ciphertext and both lengths.
+    poly1305.padded_blocks(aad);
+    poly1305.padded_blocks(input);
+    poly1305.block([aad_len.to_le(), input_len.to_le()].as_bytes());
+    // Compare via `constant_time_eq` and bail out before any plaintext is written.
+    if !constant_time_eq(poly1305.tag().as_bytes(), tag) {
+        return Err(DecryptError::TagMismatch);
+    }
+    // Second pass: the tag matched, so decrypt chunk by chunk into `output`.
+    let mut buf = [u32x4::default(); 4];
+    loop {
+        let read = read_all(&mut input, buf.as_mut_bytes())?;
+        if read == 0 { break; }
+
+        let block = chacha20.next();
+        buf[0] = buf[0] ^ block[0];
+        buf[1] = buf[1] ^ block[1];
+        buf[2] = buf[2] ^ block[2];
+        buf[3] = buf[3] ^ block[3];
+
+        output.write_all(&buf.as_bytes()[..read])?;
+    }
+
+    Ok(())
+}
+
+fn read_all<R: Read>(reader: &mut R, mut buf: &mut [u8]) -> io::Result<usize> {
+    let mut read = 0; // bytes stored in the caller's buffer so far
+    while !buf.is_empty() { // keep reading until the buffer is full...
+        match reader.read(buf) {
+            Ok(0) => break, // ...or the reader reports end of input
+            Ok(n) => { read += n; let tmp = buf; buf = &mut tmp[n..]; }
+            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} // retry the read
+            Err(e) => return Err(e),
+        }
+    }
+    Ok(read)
+}
+
+/// Error returned from the `decrypt` function.
+#[derive(Debug)]
+pub enum DecryptError {
+    /// The calculated Poly1305 tag did not match the given tag.
+    TagMismatch,
+
+    /// An I/O error: writing the output failed, or the input is too long ("counter overflow").
+    IoError(io::Error),
+}
+
+impl Display for DecryptError {
+    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result { // user-facing message for each variant
+        match *self {
+            DecryptError::TagMismatch => fmt.write_str("authentication tag mismatch"),
+            DecryptError::IoError(ref e) => e.fmt(fmt), // defer to the I/O error's own text
+        }
+    }
+}
+
+impl Error for DecryptError {
+    fn description(&self) -> &str { // deprecated API, kept so existing callers see the same text
+        match *self {
+            DecryptError::TagMismatch => "authentication tag mismatch",
+            DecryptError::IoError(ref e) => e.description(),
+        }
+    }
+    /// Exposes the wrapped I/O error; the default `cause()` forwards to this method.
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        match *self {
+            DecryptError::TagMismatch => None,
+            DecryptError::IoError(ref e) => Some(e),
+        }
+    }
+}
+
+impl From<io::Error> for DecryptError {
+    fn from(error: io::Error) -> Self { // lets `?` lift I/O errors inside `decrypt`
+        DecryptError::IoError(error)
+    }
+}
+
+impl From<DecryptError> for io::Error {
+    fn from(error: DecryptError) -> Self {
+        match error {
+            DecryptError::IoError(e) => e, // hand back the original I/O error unchanged
+            DecryptError::TagMismatch => // reported as `InvalidData`, with the error as payload
+                io::Error::new(ErrorKind::InvalidData, error),
+        }
+    }
+}
+
+pub mod selftest {
+    use super::*;
+    // Known-answer vectors: the AEAD example from RFC 7539, section 2.8.2.
+    static PLAINTEXT: &'static [u8] = b"\
+        Ladies and Gentlemen of the class of '99: If I could offer you o\
+        nly one tip for the future, sunscreen would be it.";
+
+    static AAD: &'static [u8] = &[0x50, 0x51, 0x52, 0x53,
+        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7];
+
+    static KEY: &'static [u8] = &[
+        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f];
+
+    static NONCE: &'static [u8] = &[0x07, 0x00, 0x00, 0x00,
+        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47];
+
+    static CIPHERTEXT: &'static [u8] = &[
+        0xd3, 0x1a, 0x8d, 0x34, 0x64, 0x8e, 0x60, 0xdb,
+        0x7b, 0x86, 0xaf, 0xbc, 0x53, 0xef, 0x7e, 0xc2,
+        0xa4, 0xad, 0xed, 0x51, 0x29, 0x6e, 0x08, 0xfe,
+        0xa9, 0xe2, 0xb5, 0xa7, 0x36, 0xee, 0x62, 0xd6,
+        0x3d, 0xbe, 0xa4, 0x5e, 0x8c, 0xa9, 0x67, 0x12,
+        0x82, 0xfa, 0xfb, 0x69, 0xda, 0x92, 0x72, 0x8b,
+        0x1a, 0x71, 0xde, 0x0a, 0x9e, 0x06, 0x0b, 0x29,
+        0x05, 0xd6, 0xa5, 0xb6, 0x7e, 0xcd, 0x3b, 0x36,
+        0x92, 0xdd, 0xbd, 0x7f, 0x2d, 0x77, 0x8b, 0x8c,
+        0x98, 0x03, 0xae, 0xe3, 0x28, 0x09, 0x1b, 0x58,
+        0xfa, 0xb3, 0x24, 0xe4, 0xfa, 0xd6, 0x75, 0x94,
+        0x55, 0x85, 0x80, 0x8b, 0x48, 0x31, 0xd7, 0xbc,
+        0x3f, 0xf4, 0xde, 0xf0, 0x8e, 0x4b, 0x7a, 0x9d,
+        0xe5, 0x76, 0xd2, 0x65, 0x86, 0xce, 0xc6, 0x4b,
+        0x61, 0x16];
+
+    static TAG: &'static [u8] = &[
+        0x1a, 0xe1, 0x0b, 0x59, 0x4f, 0x09, 0xe2, 0x6a,
+        0x7e, 0x90, 0x2e, 0xcb, 0xd0, 0x60, 0x06, 0x91];
+    /// Runs all three known-answer checks; panics on the first failure.
+    #[cold]
+    pub fn selftest() {
+        selftest_encrypt();
+        selftest_decrypt();
+        selftest_decrypt_mismatch();
+    }
+    /// Encrypts the reference plaintext and checks both ciphertext and tag.
+    #[cold]
+    pub fn selftest_encrypt() {
+        let mut output = Vec::with_capacity(PLAINTEXT.len());
+        let tag = encrypt(KEY, NONCE, AAD, PLAINTEXT, &mut output)
+            .expect("selftest failure");
+
+        assert_eq!(&output[..], CIPHERTEXT);
+        assert_eq!(tag, TAG);
+    }
+    /// Decrypts the reference ciphertext and checks the recovered plaintext.
+    #[cold]
+    pub fn selftest_decrypt() {
+        let mut output = Vec::with_capacity(CIPHERTEXT.len());
+        decrypt(KEY, NONCE, AAD, CIPHERTEXT, TAG, &mut output)
+            .expect("selftest failure");
+
+        assert_eq!(&output[..], PLAINTEXT);
+    }
+    /// Checks that a wrong (all-zero) tag is rejected and that no output is written.
+    #[cold]
+    pub fn selftest_decrypt_mismatch() {
+        let mut output = Vec::with_capacity(0);
+        let result = decrypt(KEY, NONCE, AAD, CIPHERTEXT, &[0; 16],
+                             &mut output);
+
+        if let Err(DecryptError::TagMismatch) = result {
+            assert!(output.is_empty());
+        } else {
+            panic!("selftest failure");
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn selftest_encrypt() {
+        selftest::selftest_encrypt();
+    }
+
+    #[test]
+    fn selftest_decrypt() {
+        selftest::selftest_decrypt();
+    }
+
+    #[test]
+    fn selftest_decrypt_mismatch() {
+        selftest::selftest_decrypt_mismatch();
+    }
+
+    #[test]
+    fn test_encrypt() {
+        let mut output = Vec::with_capacity(PLAINTEXT.len());
+        let tag = encrypt(KEY, NONCE, AAD, PLAINTEXT.as_bytes(),
+                          &mut output).expect("test failed");
+        assert_eq!(&output[..], CIPHERTEXT);
+        assert_eq!(tag, TAG);
+    }
+
+    #[test]
+    fn test_decrypt() {
+        let mut output = Vec::with_capacity(CIPHERTEXT.len());
+        decrypt(KEY, NONCE, AAD, CIPHERTEXT, TAG,
+                &mut output).expect("test failed");
+        assert_eq!(&output[..], PLAINTEXT.as_bytes());
+    }
+    // Second vector set, used by test_encrypt/test_decrypt: RFC 7539, appendix A.5.
+    static KEY: &'static [u8] = &[
+        0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
+        0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
+        0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
+        0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0];
+
+    static CIPHERTEXT: &'static [u8] = &[
+        0x64, 0xa0, 0x86, 0x15, 0x75, 0x86, 0x1a, 0xf4,
+        0x60, 0xf0, 0x62, 0xc7, 0x9b, 0xe6, 0x43, 0xbd,
+        0x5e, 0x80, 0x5c, 0xfd, 0x34, 0x5c, 0xf3, 0x89,
+        0xf1, 0x08, 0x67, 0x0a, 0xc7, 0x6c, 0x8c, 0xb2,
+        0x4c, 0x6c, 0xfc, 0x18, 0x75, 0x5d, 0x43, 0xee,
+        0xa0, 0x9e, 0xe9, 0x4e, 0x38, 0x2d, 0x26, 0xb0,
+        0xbd, 0xb7, 0xb7, 0x3c, 0x32, 0x1b, 0x01, 0x00,
+        0xd4, 0xf0, 0x3b, 0x7f, 0x35, 0x58, 0x94, 0xcf,
+        0x33, 0x2f, 0x83, 0x0e, 0x71, 0x0b, 0x97, 0xce,
+        0x98, 0xc8, 0xa8, 0x4a, 0xbd, 0x0b, 0x94, 0x81,
+        0x14, 0xad, 0x17, 0x6e, 0x00, 0x8d, 0x33, 0xbd,
+        0x60, 0xf9, 0x82, 0xb1, 0xff, 0x37, 0xc8, 0x55,
+        0x97, 0x97, 0xa0, 0x6e, 0xf4, 0xf0, 0xef, 0x61,
+        0xc1, 0x86, 0x32, 0x4e, 0x2b, 0x35, 0x06, 0x38,
+        0x36, 0x06, 0x90, 0x7b, 0x6a, 0x7c, 0x02, 0xb0,
+        0xf9, 0xf6, 0x15, 0x7b, 0x53, 0xc8, 0x67, 0xe4,
+        0xb9, 0x16, 0x6c, 0x76, 0x7b, 0x80, 0x4d, 0x46,
+        0xa5, 0x9b, 0x52, 0x16, 0xcd, 0xe7, 0xa4, 0xe9,
+        0x90, 0x40, 0xc5, 0xa4, 0x04, 0x33, 0x22, 0x5e,
+        0xe2, 0x82, 0xa1, 0xb0, 0xa0, 0x6c, 0x52, 0x3e,
+        0xaf, 0x45, 0x34, 0xd7, 0xf8, 0x3f, 0xa1, 0x15,
+        0x5b, 0x00, 0x47, 0x71, 0x8c, 0xbc, 0x54, 0x6a,
+        0x0d, 0x07, 0x2b, 0x04, 0xb3, 0x56, 0x4e, 0xea,
+        0x1b, 0x42, 0x22, 0x73, 0xf5, 0x48, 0x27, 0x1a,
+        0x0b, 0xb2, 0x31, 0x60, 0x53, 0xfa, 0x76, 0x99,
+        0x19, 0x55, 0xeb, 0xd6, 0x31, 0x59, 0x43, 0x4e,
+        0xce, 0xbb, 0x4e, 0x46, 0x6d, 0xae, 0x5a, 0x10,
+        0x73, 0xa6, 0x72, 0x76, 0x27, 0x09, 0x7a, 0x10,
+        0x49, 0xe6, 0x17, 0xd9, 0x1d, 0x36, 0x10, 0x94,
+        0xfa, 0x68, 0xf0, 0xff, 0x77, 0x98, 0x71, 0x30,
+        0x30, 0x5b, 0xea, 0xba, 0x2e, 0xda, 0x04, 0xdf,
+        0x99, 0x7b, 0x71, 0x4d, 0x6c, 0x6f, 0x2c, 0x29,
+        0xa6, 0xad, 0x5c, 0xb4, 0x02, 0x2b, 0x02, 0x70,
+        0x9b];
+
+    static NONCE: &'static [u8] = &[0x00, 0x00, 0x00, 0x00,
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
+
+    static AAD: &'static [u8] = &[0xf3, 0x33, 0x88, 0x86,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x91];
+
+    static TAG: &'static [u8] = &[
+        0xee, 0xad, 0x9d, 0x67, 0x89, 0x0c, 0xbb, 0x22,
+        0x39, 0x23, 0x36, 0xfe, 0xa1, 0x85, 0x1f, 0x38];
+
+    static PLAINTEXT: &'static str = "\
+        Internet-Drafts are draft documents valid for a maximum of six m\
+        onths and may be updated, replaced, or obsoleted by other docume\
+        nts at any time. It is inappropriate to use Internet-Drafts as r\
+        eference material or to cite them other than as /\u{201c}work in prog\
+        ress./\u{201d}";
+}
+
+#[cfg(all(feature = "bench", test))]
+mod bench {
+    use test::{Bencher, black_box};
+    use super::*;
+    // Measures `encrypt` over `data` with an all-0xff key and nonce, reusing one output buffer.
+    #[cfg_attr(feature = "clippy", allow(result_unwrap_used))]
+    fn bench_encrypt(b: &mut Bencher, aad: &[u8], data: &[u8]) {
+        let key = [!0; 32];
+        let nonce = [!0; 12];
+
+        let mut buf = Vec::with_capacity(data.len());
+
+        b.bytes = data.len() as u64;
+        b.iter(|| {
+            buf.clear();
+            encrypt(black_box(&key), black_box(&nonce),
+                    black_box(aad), black_box(data),
+                    black_box(&mut buf)).unwrap()
+        })
+    }
+    // Measures `decrypt` over a ciphertext and tag produced once, outside the timed loop.
+    #[cfg_attr(feature = "clippy", allow(result_unwrap_used))]
+    fn bench_decrypt(b: &mut Bencher, aad: &[u8], data: &[u8]) {
+        let key = [!0; 32];
+        let nonce = [!0; 12];
+
+        let mut ciphertext = Vec::with_capacity(data.len());
+        let tag = encrypt(&key, &nonce, aad, data, &mut ciphertext).unwrap();
+        let input = &ciphertext[..];
+
+        let mut buf = Vec::with_capacity(data.len());
+
+        b.bytes = data.len() as u64;
+        b.iter(|| {
+            buf.clear();
+            decrypt(black_box(&key), black_box(&nonce),
+                    black_box(aad), black_box(input), black_box(&tag),
+                    black_box(&mut buf)).unwrap()
+        })
+    }
+
+    #[bench]
+    fn bench_encrypt_16(b: &mut Bencher) {
+        bench_encrypt(b, &[!0; 16], &[!0; 16])
+    }
+
+    #[bench]
+    fn bench_encrypt_4k(b: &mut Bencher) {
+        bench_encrypt(b, &[!0; 16], &[!0; 4096])
+    }
+
+    #[bench]
+    fn bench_encrypt_64k(b: &mut Bencher) {
+        bench_encrypt(b, &[!0; 16], &[!0; 65536])
+    }
+
+    #[bench]
+    fn bench_decrypt_16(b: &mut Bencher) {
+        bench_decrypt(b, &[!0; 16], &[!0; 16])
+    }
+
+    #[bench]
+    fn bench_decrypt_4k(b: &mut Bencher) {
+        bench_decrypt(b, &[!0; 16], &[!0; 4096])
+    }
+
+    #[bench]
+    fn bench_decrypt_64k(b: &mut Bencher) {
+        bench_decrypt(b, &[!0; 16], &[!0; 65536])
+    }
+}
diff --git a/src/as_bytes.rs b/src/as_bytes.rs
new file mode 100644
index 0000000..bb61cdf
--- /dev/null
+++ b/src/as_bytes.rs
@@ -0,0 +1,43 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+use std::mem;
+use std::slice;
+
+pub unsafe trait Safe {} // marker: slices of this type may be viewed as raw bytes (see `AsBytes for [T]`)
+
+pub trait AsBytes {
+    fn as_bytes(&self) -> &[u8]; // shared view of `self` as bytes
+    fn as_mut_bytes(&mut self) -> &mut [u8]; // writable view of `self` as bytes
+}
+
+impl<T: Safe> AsBytes for [T] {
+    #[inline]
+    fn as_bytes(&self) -> &[u8] {
+        let byte_len = mem::size_of_val(self);
+        // SAFETY: `T: Safe` vouches that the elements may be viewed as raw
+        // bytes; pointer and length describe exactly the memory of `self`.
+        unsafe { slice::from_raw_parts(self.as_ptr() as *const u8, byte_len) }
+    }
+
+    #[inline]
+    fn as_mut_bytes(&mut self) -> &mut [u8] {
+        let byte_len = mem::size_of_val(&*self);
+        // SAFETY: as in `as_bytes`; the exclusive borrow of `self` is carried
+        // over to the returned byte slice.
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr() as *mut u8, byte_len) }
+    }
+}
+
+unsafe impl Safe for u8 {} // fixed-width integers only: no padding bytes, every bit pattern is valid
+unsafe impl Safe for u16 {}
+unsafe impl Safe for u32 {}
+unsafe impl Safe for u64 {}
+unsafe impl Safe for i8 {}
+unsafe impl Safe for i16 {}
+unsafe impl Safe for i32 {}
+unsafe impl Safe for i64 {}
diff --git a/src/chacha20.rs b/src/chacha20.rs
new file mode 100644
index 0000000..08d9bf7
--- /dev/null
+++ b/src/chacha20.rs
@@ -0,0 +1,250 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+use crate::as_bytes::AsBytes;
+use crate::simd::{Vector4, u32x4};
+
+#[derive(Clone, Debug)]
+pub struct ChaCha20 {
+    state: [u32x4; 3] // rows built in `with_counter`: key bytes 0..16, key bytes 16..32, (counter, nonce)
+}
+
+#[cfg_attr(feature = "clippy", allow(should_implement_trait))]
+impl ChaCha20 {
+    /// Creates a cipher state for `key` and `nonce` with the block counter at 0.
+    pub fn new(key: &[u8], nonce: &[u8]) -> Self {
+        Self::with_counter(key, nonce, 0)
+    }
+
+    /// Creates a cipher state whose first generated block uses `counter`.
+    ///
+    /// Panics unless `key` is 32 bytes long and `nonce` is 12 bytes long.
+    pub fn with_counter(key: &[u8], nonce: &[u8], counter: u32) -> Self {
+        assert!(key.len() == 32);
+        assert!(nonce.len() == 12);
+
+        let mut key_rows = [u32x4::default(); 2];
+        key_rows.as_mut_bytes().clone_from_slice(key);
+        let mut words = [0; 3];
+        words.as_mut_bytes().clone_from_slice(nonce);
+
+        // Key and nonce bytes were copied in verbatim, so every row goes
+        // through `from_le`; the counter is pre-swapped with `to_le` to match.
+        let tail = u32x4::new(counter.to_le(), words[0], words[1], words[2]);
+        ChaCha20 { state: [key_rows[0].from_le(), key_rows[1].from_le(), tail.from_le()] }
+    }
+
+    /// Mixes the four rows with the add / xor / rotate sequence, using
+    /// rotation amounts 16, 12, 8 and 7 in that order.
+    fn round(state: &mut [u32x4; 4]) {
+        state[0] = state[0].wrapping_add(state[1]);
+        state[3] = (state[3] ^ state[0]).rotate_left_const(16);
+        state[2] = state[2].wrapping_add(state[3]);
+        state[1] = (state[1] ^ state[2]).rotate_left_const(12);
+        state[0] = state[0].wrapping_add(state[1]);
+        state[3] = (state[3] ^ state[0]).rotate_left_const(8);
+        state[2] = state[2].wrapping_add(state[3]);
+        state[1] = (state[1] ^ state[2]).rotate_left_const(7);
+    }
+
+    /// Applies `shuffle_left_1/2/3` to rows 1, 2 and 3 respectively.
+    fn shuffle(state: &mut [u32x4; 4]) {
+        state[1] = state[1].shuffle_left_1();
+        state[2] = state[2].shuffle_left_2();
+        state[3] = state[3].shuffle_left_3();
+    }
+
+    /// Applies the matching right shuffles to rows 1, 2 and 3.
+    fn unshuffle(state: &mut [u32x4; 4]) {
+        state[1] = state[1].shuffle_right_1();
+        state[2] = state[2].shuffle_right_2();
+        state[3] = state[3].shuffle_right_3();
+    }
+
+    /// Runs `round` twice: once as-is, once between `shuffle` and `unshuffle`.
+    fn round_pair(state: &mut [u32x4; 4]) {
+        Self::round(state);
+        Self::shuffle(state);
+        Self::round(state);
+        Self::unshuffle(state);
+    }
+
+    /// Computes the block for the current counter without advancing it.
+    fn block(&self) -> [u32x4; 4] {
+        let constants = u32x4::new(0x61707865, 0x3320646e, 0x79622d32, 0x6b206574);
+        let input = [constants, self.state[0], self.state[1], self.state[2]];
+
+        // Ten round pairs, i.e. twenty calls to `round`.
+        let mut working = input;
+        for _ in 0..10 {
+            Self::round_pair(&mut working);
+        }
+
+        // Add the input back row by row and store each word little-endian.
+        [
+            working[0].wrapping_add(input[0]).to_le(),
+            working[1].wrapping_add(input[1]).to_le(),
+            working[2].wrapping_add(input[2]).to_le(),
+            working[3].wrapping_add(input[3]).to_le(),
+        ]
+    }
+
+    /// Returns the current block, then increments the block counter (wrapping).
+    pub fn next(&mut self) -> [u32x4; 4] {
+        let out = self.block();
+        self.state[2].0 = self.state[2].0.wrapping_add(1);
+        out
+    }
+}
+
+/// Checks the chacha20 block function against one fixed key/nonce/counter vector.
+#[cold]
+pub fn selftest() {
+    let key = [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+               0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+               0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+               0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f];
+    let nonce = [0x00, 0x00, 0x00, 0x09,
+                 0x00, 0x00, 0x00, 0x4a,
+                 0x00, 0x00, 0x00, 0x00];
+    let expected = [0x10, 0xf1, 0xe7, 0xe4, 0xd1, 0x3b, 0x59, 0x15,
+                    0x50, 0x0f, 0xdd, 0x1f, 0xa3, 0x20, 0x71, 0xc4,
+                    0xc7, 0xd1, 0xf4, 0xc7, 0x33, 0xc0, 0x68, 0x03,
+                    0x04, 0x22, 0xaa, 0x9a, 0xc3, 0xd4, 0x6c, 0x4e,
+                    0xd2, 0x82, 0x64, 0x46, 0x07, 0x9f, 0xaa, 0x09,
+                    0x14, 0xc2, 0xd7, 0x05, 0xd9, 0x8b, 0x02, 0xa2,
+                    0xb5, 0x12, 0x9c, 0xd1, 0xde, 0x16, 0x4e, 0xb9,
+                    0xcb, 0xd0, 0x83, 0xe8, 0xa2, 0x50, 0x3c, 0x4e];
+
+    // The counter starts at 1 for this vector; a mismatch panics via `assert_eq!`.
+    let keystream = ChaCha20::with_counter(&key, &nonce, 1).next();
+    assert_eq!(keystream.as_bytes(), &expected[..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::as_bytes::AsBytes; // crate-qualified, matching the non-test imports of this file
+    use super::ChaCha20;
+
+    #[test]
+    fn selftest() { // runs the same fixed vector as the public `selftest`
+        super::selftest();
+    }
+
+    #[test]
+    fn test_vector_1_and_2() { // all-zero key and nonce; two consecutive blocks from counter 0
+        let mut state = ChaCha20::new(&[0; 32], &[0; 12]);
+
+        assert_eq!(state.next().as_bytes(),
+                   &[0x76, 0xb8, 0xe0, 0xad, 0xa0, 0xf1, 0x3d, 0x90,
+                     0x40, 0x5d, 0x6a, 0xe5, 0x53, 0x86, 0xbd, 0x28,
+                     0xbd, 0xd2, 0x19, 0xb8, 0xa0, 0x8d, 0xed, 0x1a,
+                     0xa8, 0x36, 0xef, 0xcc, 0x8b, 0x77, 0x0d, 0xc7,
+                     0xda, 0x41, 0x59, 0x7c, 0x51, 0x57, 0x48, 0x8d,
+                     0x77, 0x24, 0xe0, 0x3f, 0xb8, 0xd8, 0x4a, 0x37,
+                     0x6a, 0x43, 0xb8, 0xf4, 0x15, 0x18, 0xa1, 0x1c,
+                     0xc3, 0x87, 0xb6, 0x69, 0xb2, 0xee, 0x65, 0x86][..]);
+
+        assert_eq!(state.next().as_bytes(),
+                   &[0x9f, 0x07, 0xe7, 0xbe, 0x55, 0x51, 0x38, 0x7a,
+                     0x98, 0xba, 0x97, 0x7c, 0x73, 0x2d, 0x08, 0x0d,
+                     0xcb, 0x0f, 0x29, 0xa0, 0x48, 0xe3, 0x65, 0x69,
+                     0x12, 0xc6, 0x53, 0x3e, 0x32, 0xee, 0x7a, 0xed,
+                     0x29, 0xb7, 0x21, 0x76, 0x9c, 0xe6, 0x4e, 0x43,
+                     0xd5, 0x71, 0x33, 0xb0, 0x74, 0xd8, 0x39, 0xd5,
+                     0x31, 0xed, 0x1f, 0x28, 0x51, 0x0a, 0xfb, 0x45,
+                     0xac, 0xe1, 0x0a, 0x1f, 0x4b, 0x79, 0x4d, 0x6f][..]);
+    }
+
+    #[test]
+    fn test_vector_3() { // only the last key byte is set; counter 1
+        let key = [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01];
+
+        let mut state = ChaCha20::with_counter(&key, &[0; 12], 1);
+
+        assert_eq!(state.next().as_bytes(),
+                   &[0x3a, 0xeb, 0x52, 0x24, 0xec, 0xf8, 0x49, 0x92,
+                     0x9b, 0x9d, 0x82, 0x8d, 0xb1, 0xce, 0xd4, 0xdd,
+                     0x83, 0x20, 0x25, 0xe8, 0x01, 0x8b, 0x81, 0x60,
+                     0xb8, 0x22, 0x84, 0xf3, 0xc9, 0x49, 0xaa, 0x5a,
+                     0x8e, 0xca, 0x00, 0xbb, 0xb4, 0xa7, 0x3b, 0xda,
+                     0xd1, 0x92, 0xb5, 0xc4, 0x2f, 0x73, 0xf2, 0xfd,
+                     0x4e, 0x27, 0x36, 0x44, 0xc8, 0xb3, 0x61, 0x25,
+                     0xa6, 0x4a, 0xdd, 0xeb, 0x00, 0x6c, 0x13, 0xa0][..]);
+    }
+
+    #[test]
+    fn test_vector_4() { // only the second key byte is set; counter 2
+        let key = [0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = ChaCha20::with_counter(&key, &[0; 12], 2);
+
+        assert_eq!(state.next().as_bytes(),
+                   &[0x72, 0xd5, 0x4d, 0xfb, 0xf1, 0x2e, 0xc4, 0x4b,
+                     0x36, 0x26, 0x92, 0xdf, 0x94, 0x13, 0x7f, 0x32,
+                     0x8f, 0xea, 0x8d, 0xa7, 0x39, 0x90, 0x26, 0x5e,
+                     0xc1, 0xbb, 0xbe, 0xa1, 0xae, 0x9a, 0xf0, 0xca,
+                     0x13, 0xb2, 0x5a, 0xa2, 0x6c, 0xb4, 0xa6, 0x48,
+                     0xcb, 0x9b, 0x9d, 0x1b, 0xe6, 0x5b, 0x2c, 0x09,
+                     0x24, 0xa6, 0x6c, 0x54, 0xd5, 0x45, 0xec, 0x1b,
+                     0x73, 0x74, 0xf4, 0x87, 0x2e, 0x99, 0xf0, 0x96][..]);
+    }
+
+    #[test]
+    fn test_vector_5() { // all-zero key; only the last nonce byte is set; counter 0
+        let nonce = [0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x02];
+
+        let mut state = ChaCha20::with_counter(&[0; 32], &nonce, 0);
+
+        assert_eq!(state.next().as_bytes(),
+                   &[0xc2, 0xc6, 0x4d, 0x37, 0x8c, 0xd5, 0x36, 0x37,
+                     0x4a, 0xe2, 0x04, 0xb9, 0xef, 0x93, 0x3f, 0xcd,
+                     0x1a, 0x8b, 0x22, 0x88, 0xb3, 0xdf, 0xa4, 0x96,
+                     0x72, 0xab, 0x76, 0x5b, 0x54, 0xee, 0x27, 0xc7,
+                     0x8a, 0x97, 0x0e, 0x0e, 0x95, 0x5c, 0x14, 0xf3,
+                     0xa8, 0x8e, 0x74, 0x1b, 0x97, 0xc2, 0x86, 0xf7,
+                     0x5f, 0x8f, 0xc2, 0x99, 0xe8, 0x14, 0x83, 0x62,
+                     0xfa, 0x19, 0x8a, 0x39, 0x53, 0x1b, 0xed, 0x6d][..]);
+    }
+}
+
+#[cfg(all(feature = "bench", test))]
+mod bench {
+    use test::{Bencher, black_box};
+    use super::ChaCha20;
+
+    // Cost of building a state from key, nonce and counter; `b.bytes` is
+    // 48, which matches 32 key + 12 nonce + 4 counter bytes.
+    #[bench]
+    fn bench_new(b: &mut Bencher) {
+        let (key, nonce) = ([!0; 32], [!0; 12]);
+        let mut next_counter = 0;
+
+        b.bytes = 48;
+        b.iter(|| {
+            next_counter += 1;
+            ChaCha20::with_counter(black_box(&key), black_box(&nonce), next_counter)
+        })
+    }
+
+    // Cost of producing one 64-byte block and advancing the counter.
+    #[bench]
+    fn bench_block(b: &mut Bencher) {
+        let mut cipher = ChaCha20::new(&[!0; 32], &[!0; 12]);
+
+        b.bytes = 64;
+        b.iter(|| cipher.next())
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c672368
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,60 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+//! A pure Rust implementation of the ChaCha20-Poly1305 AEAD from RFC 7539.
+//!
+//! An Authenticated Encryption with Associated Data (AEAD) mode
+//! encrypts data and generates an authentication tag, or decrypts data
+//! and verifies an authentication tag, as a single operation. The tag
+//! can also validate additional authenticated data (AAD) which is not
+//! included in the ciphertext, for instance a plaintext header.
+//!
+//! The ChaCha20-Poly1305 AEAD uses a 256-bit (32-byte) key, and a
+//! 96-bit (12-byte) nonce. For each key, a given nonce should be used
+//! only once, otherwise the encryption and authentication can be
+//! broken. One way to prevent reuse is for the nonce to contain a
+//! sequence number.
+//!
+//! The amount of data that can be encrypted in a single call is 2^32 - 1
+//! blocks of 64 bytes, slightly less than 256 GiB.
+
+#![warn(missing_docs)]
+
+#![cfg_attr(feature = "clippy", feature(plugin))]
+#![cfg_attr(feature = "clippy", plugin(clippy))]
+#![cfg_attr(feature = "clippy", warn(clippy_pedantic))]
+
+#![cfg_attr(all(feature = "bench", test), feature(test))]
+#![cfg_attr(feature = "simd", feature(platform_intrinsics, repr_simd))]
+#![cfg_attr(feature = "simd_opt", feature(cfg_target_feature))]
+
+#[cfg(all(feature = "bench", test))]
+extern crate test;
+
+extern crate constant_time_eq;
+
+mod as_bytes;
+
+mod simdty;
+mod simdint;
+mod simdop;
+mod simd_opt;
+mod simd;
+
+mod chacha20;
+mod poly1305;
+mod aead;
+
+pub use aead::{DecryptError, decrypt, encrypt, encrypt_read};
+
+/// Runs the self-tests for ChaCha20, Poly1305, and the AEAD, in that order.
+#[cold]
+pub fn selftest() {
+    chacha20::selftest(); // panics via `assert_eq!` if the block vector mismatches
+    poly1305::selftest(); // panics via `assert_eq!` if the tag vector mismatches
+    aead::selftest::selftest();
+}
diff --git a/src/poly1305.rs b/src/poly1305.rs
new file mode 100644
index 0000000..dfb2dce
--- /dev/null
+++ b/src/poly1305.rs
@@ -0,0 +1,536 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+// The 130-bit accumulator is split into five 26-bit limbs, with the
+// carry between the limbs delayed.
+//
+// The reduction steps use the following identity:
+//
+// a×2^n ≡ a×c (mod 2^n−c)
+//
+// For Poly1305, the identity becomes:
+//
+// a×2^130 ≡ a×5 (mod 2^130−5)
+//
+// That is, any limb or carry above 2^130 is multiplied by 5 and added
+// back to the lower limbs.
+//
+// Based on the algorithm from https://github.com/floodyberry/poly1305-donna
+
+#[derive(Clone, Debug)]
+pub struct Poly1305 {
+    /// Accumulator: 5x26-bit limbs (a small carry may be left pending in `a[1]`)
+    a: [u32; 5],
+    /// Multiplier `r`: 5x26-bit limbs, read from key bytes 0..16 and masked in `new`
+    r: [u32; 5],
+    /// Secret key `s`: 4x32-bit words from key bytes 16..32, added in `tag`
+    s: [u32; 4],
+}
+
+impl Poly1305 {
+    pub fn new(key: &[u8]) -> Self { // key = `r` material (bytes 0..16) followed by `s` (bytes 16..32)
+        assert!(key.len() == 32);
+
+        Poly1305 {
+            a: [0; 5],
+
+            // r &= 0x0ffffffc_0ffffffc_0ffffffc_0fffffff; (clamp, expressed per 26-bit limb below)
+            r: [u32_from_le(&key[ 0.. 4])      & 0x03ffffff, // limb 0: bits 0..26
+                u32_from_le(&key[ 3.. 7]) >> 2 & 0x03ffff03, // limb 1: bits 26..52
+                u32_from_le(&key[ 6..10]) >> 4 & 0x03ffc0ff, // limb 2: bits 52..78
+                u32_from_le(&key[ 9..13]) >> 6 & 0x03f03fff, // limb 3: bits 78..104
+                u32_from_le(&key[12..16]) >> 8 & 0x000fffff], // limb 4: bits 104..128
+
+            s: [u32_from_le(&key[16..20]),
+                u32_from_le(&key[20..24]),
+                u32_from_le(&key[24..28]),
+                u32_from_le(&key[28..32])],
+        }
+    }
+
+    pub fn block(&mut self, msg: &[u8]) { // absorbs one full block; panics unless `msg` is 16 bytes
+        assert!(msg.len() == 16);
+        self.accumulate(u32_from_le(&msg[ 0.. 4])      & 0x03ffffff,
+                        u32_from_le(&msg[ 3.. 7]) >> 2 & 0x03ffffff,
+                        u32_from_le(&msg[ 6..10]) >> 4 & 0x03ffffff,
+                        u32_from_le(&msg[ 9..13]) >> 6 & 0x03ffffff,
+                        u32_from_le(&msg[12..16]) >> 8 | (1 <<  24)); // sets bit 128 (limb 4 starts at bit 104)
+    }
+
+    pub fn last_block(mut self, msg: &[u8]) -> [u32; 4] { // absorbs a final block of at most 16 bytes (if any), returns the tag
+        if !msg.is_empty() {
+            assert!(msg.len() <= 16);
+
+            let mut buf = [0; 17]; // 16 message bytes plus room for the appended marker byte
+            buf[..msg.len()].clone_from_slice(msg);
+            buf[msg.len()] = 1; // marker byte directly after the message
+
+            self.accumulate(u32_from_le(&buf[ 0.. 4])      & 0x03ffffff,
+                            u32_from_le(&buf[ 3.. 7]) >> 2 & 0x03ffffff,
+                            u32_from_le(&buf[ 6..10]) >> 4 & 0x03ffffff,
+                            u32_from_le(&buf[ 9..13]) >> 6 & 0x03ffffff,
+                            u32_from_le(&buf[13..17])); // limb 4 read unshifted from byte 13 (bit 104), includes byte 16
+        }
+
+        self.tag()
+    }
+
+    fn padded_block(&mut self, msg: &[u8]) { // zero-pads `msg` to 16 bytes and absorbs it as a full block
+        assert!(msg.len() <= 16);
+        let mut buf = [0; 16];
+        buf[..msg.len()].clone_from_slice(msg);
+        self.block(&buf);
+    }
+
+    pub fn padded_blocks(&mut self, mut msg: &[u8]) { // absorbs `msg` in 16-byte blocks, zero-padding a trailing partial block
+        while msg.len() >= 16 {
+            self.block(&msg[..16]);
+            msg = &msg[16..];
+        }
+        if !msg.is_empty() {
+            self.padded_block(msg);
+        }
+    }
+
+    fn accumulate(&mut self, n0: u32, n1: u32, n2: u32, n3: u32, n4: u32) { // a = (a + n) * r, limb-wise
+        self.a[0] += n0;
+        self.a[1] += n1;
+        self.a[2] += n2;
+        self.a[3] += n3;
+        self.a[4] += n4;
+        self.mul_r_mod_p();
+    }
+
+    #[cfg_attr(feature = "clippy", allow(cast_possible_truncation))]
+    fn mul_r_mod_p(&mut self) {
+        // t = r * a; high limbs multiplied by 5 and added to low limbs
+        let mut t = [0; 5];
+
+        t[0] +=      self.r[0]  as u64 * self.a[0] as u64;
+        t[1] +=      self.r[0]  as u64 * self.a[1] as u64;
+        t[2] +=      self.r[0]  as u64 * self.a[2] as u64;
+        t[3] +=      self.r[0]  as u64 * self.a[3] as u64;
+        t[4] +=      self.r[0]  as u64 * self.a[4] as u64;
+
+        t[0] += (5 * self.r[1]) as u64 * self.a[4] as u64;
+        t[1] +=      self.r[1]  as u64 * self.a[0] as u64;
+        t[2] +=      self.r[1]  as u64 * self.a[1] as u64;
+        t[3] +=      self.r[1]  as u64 * self.a[2] as u64;
+        t[4] +=      self.r[1]  as u64 * self.a[3] as u64;
+
+        t[0] += (5 * self.r[2]) as u64 * self.a[3] as u64;
+        t[1] += (5 * self.r[2]) as u64 * self.a[4] as u64;
+        t[2] +=      self.r[2]  as u64 * self.a[0] as u64;
+        t[3] +=      self.r[2]  as u64 * self.a[1] as u64;
+        t[4] +=      self.r[2]  as u64 * self.a[2] as u64;
+
+        t[0] += (5 * self.r[3]) as u64 * self.a[2] as u64;
+        t[1] += (5 * self.r[3]) as u64 * self.a[3] as u64;
+        t[2] += (5 * self.r[3]) as u64 * self.a[4] as u64;
+        t[3] +=      self.r[3]  as u64 * self.a[0] as u64;
+        t[4] +=      self.r[3]  as u64 * self.a[1] as u64;
+
+        t[0] += (5 * self.r[4]) as u64 * self.a[1] as u64;
+        t[1] += (5 * self.r[4]) as u64 * self.a[2] as u64;
+        t[2] += (5 * self.r[4]) as u64 * self.a[3] as u64;
+        t[3] += (5 * self.r[4]) as u64 * self.a[4] as u64;
+        t[4] +=      self.r[4]  as u64 * self.a[0] as u64;
+
+        // propagate carries
+        t[1] += t[0] >> 26;
+        t[2] += t[1] >> 26;
+        t[3] += t[2] >> 26;
+        t[4] += t[3] >> 26;
+
+        // mask out carries
+        self.a[0] = t[0] as u32 & 0x03ffffff;
+        self.a[1] = t[1] as u32 & 0x03ffffff;
+        self.a[2] = t[2] as u32 & 0x03ffffff;
+        self.a[3] = t[3] as u32 & 0x03ffffff;
+        self.a[4] = t[4] as u32 & 0x03ffffff;
+
+        // propagate high limb carry
+        self.a[0] += (t[4] >> 26) as u32 * 5; // overflow above bit 130 re-enters multiplied by 5
+        self.a[1] += self.a[0] >> 26;
+
+        // mask out carries
+        self.a[0] &= 0x03ffffff;
+
+        // A carry of at most 1 bit has been left in self.a[1]
+    }
+
+    fn propagate_carries(&mut self) { // resolves the pending carry so that every limb fits in 26 bits
+        // propagate carries
+        self.a[2] +=  self.a[1] >> 26;
+        self.a[3] +=  self.a[2] >> 26;
+        self.a[4] +=  self.a[3] >> 26;
+        self.a[0] += (self.a[4] >> 26) * 5;
+        self.a[1] +=  self.a[0] >> 26;
+
+        // mask out carries
+        self.a[0] &= 0x03ffffff;
+        self.a[1] &= 0x03ffffff;
+        self.a[2] &= 0x03ffffff;
+        self.a[3] &= 0x03ffffff;
+        self.a[4] &= 0x03ffffff;
+    }
+
+    fn reduce_mod_p(&mut self) { // brings the accumulator below p = 2^130 - 5
+        self.propagate_carries();
+
+        let mut t = self.a;
+
+        // t = a - p
+        t[0] += 5;
+        t[4]  = t[4].wrapping_sub(1 << 26);
+
+        // propagate carries
+        t[1] +=                   t[0] >> 26;
+        t[2] +=                   t[1] >> 26;
+        t[3] +=                   t[2] >> 26;
+        t[4]  = t[4].wrapping_add(t[3] >> 26);
+
+        // mask out carries
+        t[0] &= 0x03ffffff;
+        t[1] &= 0x03ffffff;
+        t[2] &= 0x03ffffff;
+        t[3] &= 0x03ffffff;
+
+        // constant-time select between (a - p) if non-negative, (a) otherwise
+        let mask = (t[4] >> 31).wrapping_sub(1); // all ones when bit 31 of t[4] is clear, zero when it is set
+        self.a[0] = t[0] & mask | self.a[0] & !mask;
+        self.a[1] = t[1] & mask | self.a[1] & !mask;
+        self.a[2] = t[2] & mask | self.a[2] & !mask;
+        self.a[3] = t[3] & mask | self.a[3] & !mask;
+        self.a[4] = t[4] & mask | self.a[4] & !mask;
+    }
+
+    #[cfg_attr(feature = "clippy", allow(cast_possible_truncation))]
+    pub fn tag(mut self) -> [u32; 4] { // consumes the state; returns (a mod p) + s truncated to 128 bits
+        self.reduce_mod_p();
+
+        // convert from 5x26-bit to 4x32-bit
+        let a = [self.a[0]       | self.a[1] << 26,
+                 self.a[1] >>  6 | self.a[2] << 20,
+                 self.a[2] >> 12 | self.a[3] << 14,
+                 self.a[3] >> 18 | self.a[4] <<  8];
+
+        // t = a + s
+        let mut t = [a[0] as u64 + self.s[0] as u64,
+                     a[1] as u64 + self.s[1] as u64,
+                     a[2] as u64 + self.s[2] as u64,
+                     a[3] as u64 + self.s[3] as u64];
+
+        // propagate carries
+        t[1] += t[0] >> 32;
+        t[2] += t[1] >> 32;
+        t[3] += t[2] >> 32;
+
+        // mask out carries
+        [(t[0] as u32).to_le(), // words are stored little-endian so their bytes form the tag
+         (t[1] as u32).to_le(),
+         (t[2] as u32).to_le(),
+         (t[3] as u32).to_le()]
+    }
+}
+
+/// Decodes exactly four bytes as a little-endian `u32`.
+///
+/// Panics if `src` is not exactly four bytes long.
+#[inline]
+fn u32_from_le(src: &[u8]) -> u32 {
+    use std::mem::size_of;
+    assert!(src.len() == size_of::<u32>());
+    // Assemble the value with shifts rather than an `unsafe` pointer copy;
+    // the result is the same on every host byte order.
+    u32::from(src[0])
+        | u32::from(src[1]) << 8
+        | u32::from(src[2]) << 16
+        | u32::from(src[3]) << 24
+}
+
+/// Checks the poly1305 authenticator against one fixed key/message/tag triple.
+#[cold]
+pub fn selftest() {
+    use crate::as_bytes::AsBytes;
+
+    let key = [0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+               0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+               0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+               0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b];
+    let msg = b"Cryptographic Forum Research Group";
+    let expected = [0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+                    0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9];
+
+    // Two full 16-byte blocks, then the 2-byte remainder as the final block.
+    let (full_blocks, remainder) = msg.split_at(32);
+    let mut mac = Poly1305::new(&key);
+    for chunk in full_blocks.chunks(16) { mac.block(chunk); }
+    let tag = mac.last_block(remainder);
+    assert_eq!(tag.as_bytes(), expected);
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::as_bytes::AsBytes; // crate-qualified, matching the import inside `selftest`
+    use super::Poly1305;
+
+    #[test]
+    fn selftest() { // runs the same fixed vector as the public `selftest`
+        super::selftest();
+    }
+
+    #[test]
+    fn test_vector_1() { // all-zero key and message give an all-zero tag
+        let mut state = Poly1305::new(&[0; 32]);
+        state.block(&[0; 16]);
+        state.block(&[0; 16]);
+        state.block(&[0; 16]);
+        state.block(&[0; 16]);
+        assert_eq!(state.tag().as_bytes(), &[0; 16]);
+    }
+
+    static TEXT: &'static [u8] = b"\
+        Any submission to the IETF intended by the Contributor for publi\
+        cation as all or part of an IETF Internet-Draft or RFC and any s\
+        tatement made within the context of an IETF activity is consider\
+        ed an \"IETF Contribution\". Such statements include oral statemen\
+        ts in IETF sessions, as well as written and electronic communica\
+        tions made at any time or place, which are addressed to";
+
+    #[test]
+    fn test_vector_2() { // `r` half of the key is zero, so the tag equals the `s` half
+        let key = [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0, 0x60, 0x70,
+                   0xf0, 0xef, 0xca, 0x96, 0x22, 0x7a, 0x86, 0x3e];
+        let mut msg = TEXT;
+
+        let mut state = Poly1305::new(&key);
+        while msg.len() >= 16 {
+            state.block(&msg[..16]);
+            msg = &msg[16..];
+        }
+        let tag = state.last_block(msg);
+
+        assert_eq!(tag.as_bytes(),
+                   &[0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0, 0x60, 0x70,
+                     0xf0, 0xef, 0xca, 0x96, 0x22, 0x7a, 0x86, 0x3e]);
+    }
+
+    #[test]
+    fn test_vector_3() { // same text, with the key halves of test_vector_2 swapped
+        let key = [0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0, 0x60, 0x70,
+                   0xf0, 0xef, 0xca, 0x96, 0x22, 0x7a, 0x86, 0x3e,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+        let mut msg = TEXT;
+
+        let mut state = Poly1305::new(&key);
+        while msg.len() >= 16 {
+            state.block(&msg[..16]);
+            msg = &msg[16..];
+        }
+        let tag = state.last_block(msg);
+
+        assert_eq!(tag.as_bytes(),
+                   &[0xf3, 0x47, 0x7e, 0x7c, 0xd9, 0x54, 0x17, 0xaf,
+                     0x89, 0xa6, 0xb8, 0x79, 0x4c, 0x31, 0x0c, 0xf0]);
+    }
+
+    #[test]
+    fn test_vector_4() {
+        let key = [0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
+                   0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
+                   0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
+                   0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0];
+        let mut msg: &[u8] = b"\
+            'Twas brillig, and the slithy toves\nDid gyre and gimble in the w\
+            abe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.";
+
+        let mut state = Poly1305::new(&key);
+        while msg.len() >= 16 {
+            state.block(&msg[..16]);
+            msg = &msg[16..];
+        }
+        let tag = state.last_block(msg);
+
+        assert_eq!(tag.as_bytes(),
+                   &[0x45, 0x41, 0x66, 0x9a, 0x7e, 0xaa, 0xee, 0x61,
+                     0xe7, 0x08, 0xdc, 0x7c, 0xbc, 0xc5, 0xeb, 0x62]);
+    }
+
+    #[test]
+    fn test_vector_5() {
+        let key = [0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xff; 16]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+    }
+
+    #[test]
+    fn test_vector_6() {
+        let key = [0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+    }
+
+    #[test]
+    fn test_vector_7() {
+        let key = [0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xff; 16]);
+        state.block(&[0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]);
+        state.block(&[0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+    }
+
+    #[test]
+    fn test_vector_8() {
+        let key = [0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xff; 16]);
+        state.block(&[0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+                      0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe]);
+        state.block(&[0x01; 16]);
+
+        assert_eq!(state.tag().as_bytes(), &[0; 16]);
+    }
+
+    #[test]
+    fn test_vector_9() {
+        let key = [0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]);
+    }
+
+    #[test]
+    fn test_vector_10() {
+        let key = [0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+                      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+        state.block(&[0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+                      0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+        state.block(&[0; 16]);
+        state.block(&[0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                     0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+    }
+
+    #[test]
+    fn test_vector_11() { // same key and first three blocks as test_vector_10, without the fourth
+        let key = [0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+
+        let mut state = Poly1305::new(&key);
+        state.block(&[0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+                      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+        state.block(&[0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+                      0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+        state.block(&[0; 16]);
+
+        assert_eq!(state.tag().as_bytes(),
+                   &[0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
+    }
+}
+
+#[cfg(all(feature = "bench", test))]
+mod bench {
+    use test::{Bencher, black_box};
+    use super::Poly1305;
+
+    // Cost of deriving a state from a 32-byte key (splitting it into the
+    // masked multiplier limbs and the four secret words).
+    #[bench]
+    fn bench_new(b: &mut Bencher) {
+        let key = [!0; 32];
+
+        b.bytes = 32;
+        b.iter(|| Poly1305::new(black_box(&key)))
+    }
+
+    // Cost of absorbing one full 16-byte block into a long-lived state;
+    // the accumulator keeps evolving across iterations.
+    #[bench]
+    fn bench_block(b: &mut Bencher) {
+        let msg = [!0; 16];
+        let mut mac = Poly1305::new(&[!0; 32]);
+
+        b.bytes = 16;
+        b.iter(|| black_box(&mut mac).block(black_box(&msg)))
+    }
+
+    // Cost of finishing with a 16-byte final block; `last_block` consumes
+    // the state, so each iteration works on a fresh clone (cloning is timed).
+    #[bench]
+    fn bench_last_block(b: &mut Bencher) {
+        let msg = [!0; 16];
+        let mac = Poly1305::new(&[!0; 32]);
+
+        b.bytes = 16;
+        b.iter(|| black_box(&mac).clone().last_block(black_box(&msg)))
+    }
+
+    // Cost of producing the tag from a fresh clone of an untouched state;
+    // `b.bytes` is 16, the size of the four-word tag.
+    #[bench]
+    fn bench_tag(b: &mut Bencher) {
+        let mac = Poly1305::new(&[!0; 32]);
+
+        b.bytes = 16;
+        b.iter(|| black_box(&mac).clone().tag())
+    }
+}
diff --git a/src/simd.rs b/src/simd.rs
new file mode 100644
index 0000000..2771902
--- /dev/null
+++ b/src/simd.rs
@@ -0,0 +1,110 @@
+// Copyright 2015 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+#![cfg_attr(feature = "clippy", allow(inline_always))]
+
+use crate::simd_opt;
+
+pub use crate::simdty::u32x4;
+
+/// Four-lane vector of `T`: endian conversion, wrapping add, rotate, lane shuffles.
+pub trait Vector4<T>: Copy {
+    fn from_le(self) -> Self;
+    fn to_le(self) -> Self;
+    // Arithmetic applied to every lane.
+    fn wrapping_add(self, rhs: Self) -> Self;
+    fn rotate_left_const(self, n: u32) -> Self;
+    // Lane permutations: `shuffle_left_k` moves lane k into lane 0.
+    fn shuffle_left_1(self) -> Self;
+    fn shuffle_left_2(self) -> Self;
+    fn shuffle_left_3(self) -> Self;
+    // A right shuffle by k lanes equals a left shuffle by 4 - k lanes.
+    #[inline(always)] fn shuffle_right_1(self) -> Self { Self::shuffle_left_3(self) }
+    #[inline(always)] fn shuffle_right_2(self) -> Self { Self::shuffle_left_2(self) }
+    #[inline(always)] fn shuffle_right_3(self) -> Self { Self::shuffle_left_1(self) }
+}
+
+// Implements `Vector4<$word>` for the four-lane vector type `$vec`.
+// Endianness conversions are chosen by `target_endian`; lane shuffles are
+// chosen by the `simd` feature (shuffle intrinsic vs. field permutation).
+macro_rules! impl_vector4 {
+    ($vec:ident, $word:ident) => {
+        impl Vector4<$word> for $vec {
+            #[cfg(target_endian = "little")]
+            #[inline(always)]
+            fn from_le(self) -> Self { self }
+
+            #[cfg(not(target_endian = "little"))]
+            #[inline(always)]
+            fn from_le(self) -> Self {
+                $vec::new($word::from_le(self.0), $word::from_le(self.1),
+                          $word::from_le(self.2), $word::from_le(self.3))
+            }
+
+            #[cfg(target_endian = "little")]
+            #[inline(always)]
+            fn to_le(self) -> Self { self }
+
+            #[cfg(not(target_endian = "little"))]
+            #[inline(always)]
+            fn to_le(self) -> Self {
+                $vec::new(self.0.to_le(), self.1.to_le(),
+                          self.2.to_le(), self.3.to_le())
+            }
+
+            #[inline(always)]
+            fn wrapping_add(self, rhs: Self) -> Self { self + rhs }
+
+            #[inline(always)]
+            fn rotate_left_const(self, n: u32) -> Self {
+                simd_opt::$vec::rotate_left_const(self, n)
+            }
+
+            // SAFETY (all `simd_shuffle4` calls): indices are constants in 0..4.
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn shuffle_left_1(self) -> Self {
+                use crate::simdint::simd_shuffle4;
+                unsafe { simd_shuffle4(self, self, [1, 2, 3, 0]) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn shuffle_left_1(self) -> Self {
+                $vec::new(self.1, self.2, self.3, self.0)
+            }
+
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn shuffle_left_2(self) -> Self {
+                use crate::simdint::simd_shuffle4;
+                unsafe { simd_shuffle4(self, self, [2, 3, 0, 1]) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn shuffle_left_2(self) -> Self {
+                $vec::new(self.2, self.3, self.0, self.1)
+            }
+
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn shuffle_left_3(self) -> Self {
+                use crate::simdint::simd_shuffle4;
+                unsafe { simd_shuffle4(self, self, [3, 0, 1, 2]) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn shuffle_left_3(self) -> Self {
+                $vec::new(self.3, self.0, self.1, self.2)
+            }
+        }
+    }
+}
+
+impl_vector4!(u32x4, u32);
diff --git a/src/simd_opt/mod.rs b/src/simd_opt/mod.rs
new file mode 100644
index 0000000..69bef7b
--- /dev/null
+++ b/src/simd_opt/mod.rs
@@ -0,0 +1,44 @@
+// Copyright 2015 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+#![cfg_attr(feature = "clippy", allow(inline_always))]
+
+// Reinterprets `$vec` as `$tmp`, permutes its lanes via `$shuffle`/`$idx`, converts back.
+#[cfg(feature = "simd")]
+macro_rules! transmute_shuffle {
+    ($tmp:ident, $shuffle:ident, $vec:expr, $idx:expr) => {
+        // SAFETY (caller): `$vec` and the expected result must be the same size as `$tmp`.
+        unsafe {
+            use crate::{simdint::$shuffle, simdty::$tmp};
+            use std::mem::transmute;
+            let tmp_i: $tmp = transmute($vec);
+            let tmp_o: $tmp = $shuffle(tmp_i, tmp_i, $idx);
+            transmute(tmp_o)
+        }
+    }
+}
+
+#[cfg(feature = "simd")] pub mod u32x4;
+
+// Scalar stand-in used when the `simd` feature is off: rotates lane by lane.
+#[cfg(not(feature = "simd"))]
+macro_rules! simd_opt {
+    ($vec:ident) => {
+        pub mod $vec {
+            use crate::simdty::$vec;
+            /// Rotates each of the four lanes left by `n` bits.
+            #[inline(always)]
+            pub fn rotate_left_const(vec: $vec, n: u32) -> $vec {
+                let (r0, r1) = (vec.0.rotate_left(n), vec.1.rotate_left(n));
+                let (r2, r3) = (vec.2.rotate_left(n), vec.3.rotate_left(n));
+                $vec::new(r0, r1, r2, r3)
+            }
+        }
+    }
+}
+
+#[cfg(not(feature = "simd"))] simd_opt!(u32x4);
diff --git a/src/simd_opt/u32x4.rs b/src/simd_opt/u32x4.rs
new file mode 100644
index 0000000..bcacbb8
--- /dev/null
+++ b/src/simd_opt/u32x4.rs
@@ -0,0 +1,71 @@
+// Copyright 2015 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+#![cfg_attr(feature = "clippy", allow(inline_always))]
+
+use crate::simdty::u32x4;
+
+/// Rotates every lane left by `n` bits; 16 and 8 go to dedicated routines.
+#[cfg(feature = "simd_opt")]
+#[inline(always)]
+pub fn rotate_left_const(vec: u32x4, n: u32) -> u32x4 {
+    if n == 16 { return rotate_left_16(vec); }
+    if n == 8 { return rotate_left_8(vec); }
+    rotate_left_any(vec, n)
+}
+
+/// Rotates every lane left by `n` bits using the generic shift-based routine.
+#[cfg(not(feature = "simd_opt"))]
+#[inline(always)]
+pub fn rotate_left_const(vec: u32x4, n: u32) -> u32x4 {
+    rotate_left_any(vec, n)
+}
+
+/// Rotates every lane left by `n` bits; `n` is reduced modulo the 32-bit lane width.
+#[inline(always)]
+fn rotate_left_any(vec: u32x4, n: u32) -> u32x4 {
+    let (l, r) = (n % 32, 32 - n % 32);
+    if l == 0 { return vec; } // identity; also avoids an out-of-range shift by 32
+    (vec << u32x4::new(l, l, l, l)) ^ (vec >> u32x4::new(r, r, r, r))
+}
+
+/// Rotates every lane left by 16 bits, using a byte or halfword shuffle when
+/// one of the target features tested below is enabled at compile time.
+#[cfg(feature = "simd_opt")]
+#[inline(always)]
+fn rotate_left_16(vec: u32x4) -> u32x4 {
+    if cfg!(target_feature = "ssse3") {
+        // pshufb (SSSE3) / vpshufb (AVX2)
+        return transmute_shuffle!(u8x16, simd_shuffle16, vec,
+                                  [2, 3, 0, 1, 6, 7, 4, 5,
+                                   10, 11, 8, 9, 14, 15, 12, 13]);
+    }
+
+    if cfg!(any(target_feature = "sse2", target_feature = "neon")) {
+        // pshuflw+pshufhw (SSE2) / vrev (NEON)
+        return transmute_shuffle!(u16x8, simd_shuffle8, vec,
+                                  [1, 0, 3, 2, 5, 4, 7, 6]);
+    }
+
+    // Neither shuffle applies: use the generic shift-based rotation.
+    rotate_left_any(vec, 16)
+}
+
+/// Rotates every lane left by 8 bits, via a byte shuffle when SSSE3 is enabled.
+#[cfg(feature = "simd_opt")]
+#[inline(always)]
+fn rotate_left_8(vec: u32x4) -> u32x4 {
+    if !cfg!(target_feature = "ssse3") {
+        // No byte shuffle selected: use the generic shift-based rotation.
+        return rotate_left_any(vec, 8);
+    }
+
+    // pshufb (SSSE3) / vpshufb (AVX2)
+    transmute_shuffle!(u8x16, simd_shuffle16, vec,
+                       [3, 0, 1, 2, 7, 4, 5, 6,
+                        11, 8, 9, 10, 15, 12, 13, 14])
+}
diff --git a/src/simdint.rs b/src/simdint.rs
new file mode 100644
index 0000000..1f69acd
--- /dev/null
+++ b/src/simdint.rs
@@ -0,0 +1,20 @@
+// Copyright 2015 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+#![allow(dead_code)]
+
+#[cfg(feature = "simd")]
+extern "platform-intrinsic" { // generic vector operations, declared only with `simd`
+    pub fn simd_add<T>(x: T, y: T) -> T; // backs `Add` in simdop.rs
+    pub fn simd_shl<T>(x: T, y: T) -> T; // backs `Shl` in simdop.rs
+    pub fn simd_shr<T>(x: T, y: T) -> T; // backs `Shr` in simdop.rs
+    pub fn simd_xor<T>(x: T, y: T) -> T; // backs `BitXor` in simdop.rs
+
+    pub fn simd_shuffle4<T, U>(v: T, w: T, idx: [u32; 4]) -> U; // takes 4 lane indices
+    pub fn simd_shuffle8<T, U>(v: T, w: T, idx: [u32; 8]) -> U; // takes 8 lane indices
+    pub fn simd_shuffle16<T, U>(v: T, w: T, idx: [u32; 16]) -> U; // takes 16 lane indices
+}
diff --git a/src/simdop.rs b/src/simdop.rs
new file mode 100644
index 0000000..fd99945
--- /dev/null
+++ b/src/simdop.rs
@@ -0,0 +1,93 @@
+// Copyright 2015 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+use crate::simdty::u32x4;
+#[cfg(feature = "simd")] use crate::simdint;
+
+use std::ops::{Add, BitXor, Shl, Shr};
+
+macro_rules! impl_ops {
+    ($vec:ident) => {
+        // Lane-wise addition; the scalar fallback wraps on overflow.
+        impl Add for $vec {
+            type Output = Self;
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn add(self, other: Self) -> Self {
+                // Intrinsic path; `$vec` is `#[repr(simd)]` under this feature.
+                unsafe { simdint::simd_add(self, other) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn add(self, other: Self) -> Self {
+                let (a, b) = (self.0.wrapping_add(other.0), self.1.wrapping_add(other.1));
+                let (c, d) = (self.2.wrapping_add(other.2), self.3.wrapping_add(other.3));
+                Self::new(a, b, c, d)
+            }
+        }
+
+        // Lane-wise exclusive or.
+        impl BitXor for $vec {
+            type Output = Self;
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn bitxor(self, other: Self) -> Self {
+                // Intrinsic path; `$vec` is `#[repr(simd)]` under this feature.
+                unsafe { simdint::simd_xor(self, other) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn bitxor(self, other: Self) -> Self {
+                let (a, b) = (self.0 ^ other.0, self.1 ^ other.1);
+                let (c, d) = (self.2 ^ other.2, self.3 ^ other.3);
+                Self::new(a, b, c, d)
+            }
+        }
+
+        // Lane-wise left shift: each lane is shifted by the matching lane of `other`.
+        impl Shl<$vec> for $vec {
+            type Output = Self;
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn shl(self, other: Self) -> Self {
+                // Intrinsic path; `$vec` is `#[repr(simd)]` under this feature.
+                unsafe { simdint::simd_shl(self, other) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn shl(self, other: Self) -> Self {
+                let (a, b) = (self.0 << other.0, self.1 << other.1);
+                let (c, d) = (self.2 << other.2, self.3 << other.3);
+                Self::new(a, b, c, d)
+            }
+        }
+
+        // Lane-wise right shift: each lane is shifted by the matching lane of `other`.
+        impl Shr<$vec> for $vec {
+            type Output = Self;
+            #[cfg(feature = "simd")]
+            #[inline(always)]
+            fn shr(self, other: Self) -> Self {
+                // Intrinsic path; `$vec` is `#[repr(simd)]` under this feature.
+                unsafe { simdint::simd_shr(self, other) }
+            }
+
+            #[cfg(not(feature = "simd"))]
+            #[inline(always)]
+            fn shr(self, other: Self) -> Self {
+                let (a, b) = (self.0 >> other.0, self.1 >> other.1);
+                let (c, d) = (self.2 >> other.2, self.3 >> other.3);
+                Self::new(a, b, c, d)
+            }
+        }
+    }
+}
+
+impl_ops!(u32x4);
diff --git a/src/simdty.rs b/src/simdty.rs
new file mode 100644
index 0000000..49ddd9b
--- /dev/null
+++ b/src/simdty.rs
@@ -0,0 +1,59 @@
+// Copyright 2016 chacha20-poly1305-aead Developers
+//
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+#![allow(dead_code)]
+#![allow(non_camel_case_types)]
+
+use crate::as_bytes::Safe;
+
+// Wraps every item passed to it with the derives shared by all vector types
+// (`Clone`, `Copy`, `Debug`, `Default`) and with a layout attribute selected
+// by the `simd` cargo feature:
+//
+//   * feature enabled:  `#[repr(simd)]`
+//   * feature disabled: `#[repr(C)]`
+//
+// The two `cfg_attr` conditions are complementary, so exactly one `repr`
+// is applied in any given build.
+// NOTE(review): `repr(simd)` is presumably what lets the intrinsics declared
+// in simdint.rs accept these types -- confirm.
+macro_rules! decl_simd {
+    ($($item:item)*) => {
+        $(
+            #[derive(Clone, Copy, Debug, Default)]
+            #[cfg_attr(feature = "simd", repr(simd))]
+            #[cfg_attr(not(feature = "simd"), repr(C))]
+            $item
+        )*
+    }
+}
+
+decl_simd! { // generic tuple vectors with 4, 8 and 16 public lanes
+    pub struct Simd4<T>(pub T, pub T, pub T, pub T);
+    pub struct Simd8<T>(pub T, pub T, pub T, pub T,
+                        pub T, pub T, pub T, pub T);
+    pub struct Simd16<T>(pub T, pub T, pub T, pub T,
+                         pub T, pub T, pub T, pub T,
+                         pub T, pub T, pub T, pub T,
+                         pub T, pub T, pub T, pub T);
+}
+// Concrete aliases; each holds 128 bits of lane data (4 x 32, 8 x 16, 16 x 8).
+pub type u32x4 = Simd4<u32>;
+pub type u16x8 = Simd8<u16>;
+pub type u8x16 = Simd16<u8>;
+
+#[cfg_attr(feature = "clippy", allow(inline_always))]
+impl<T> Simd4<T> {
+    /// Builds a vector from its four lanes in order: `e0` becomes field `.0`,
+    /// `e1` field `.1`, and so on.
+    #[inline(always)]
+    pub fn new(e0: T, e1: T, e2: T, e3: T) -> Self { Self(e0, e1, e2, e3) }
+}
+
+unsafe impl<T: Safe> Safe for Simd4<T> {} // NOTE(review): `Safe`'s contract is in as_bytes.rs;
+unsafe impl<T: Safe> Safe for Simd8<T> {} // presumably these hold because each type is a
+unsafe impl<T: Safe> Safe for Simd16<T> {} // padding-free aggregate of `Safe` lanes -- confirm.