Use hardware acceleration for SHA256 (#2426)

## Proposed Changes

Modify the SHA256 implementation in `eth2_hashing` so that it switches between `ring` and `sha2` to take advantage of [x86_64 SHA extensions](https://en.wikipedia.org/wiki/Intel_SHA_extensions). The extensions are available on modern Intel and AMD CPUs, and seem to provide a considerable speed-up: on my Ryzen 5950X it dropped state tree hashing times by about 30% from 35ms to 25ms (on Prater).

## Additional Info

The extensions became available in the `sha2` crate [last year](https://www.reddit.com/r/rust/comments/hf2vcx/ann_rustcryptos_sha1_and_sha2_now_support/), and are not available in Ring, which uses a [pure Rust implementation of sha2](https://github.com/briansmith/ring/blob/main/src/digest/sha2.rs). Ring is faster on CPUs that lack the extensions so I've implemented a runtime switch to use `sha2` only when the extensions are available. The runtime switching seems to impose a miniscule penalty (see the benchmarks linked below).
This commit is contained in:
Michael Sproul
2021-07-12 08:47:01 +00:00
parent a7b7134abb
commit 2c691af95b
10 changed files with 222 additions and 86 deletions

View File

@@ -1,61 +1,205 @@
//! Provides a simple hash function utilizing `ring::digest::SHA256`.
//! Optimized SHA256 for use in Ethereum 2.0.
//!
//! The purpose of this crate is to provide an abstraction to whatever hash function Ethereum
//! 2.0 is using. The hash function has been subject to change during the specification process, so
//! defining it once in this crate makes it easy to replace.
//! The initial purpose of this crate was to provide an abstraction over the hash function used in
//! Ethereum 2.0. The hash function changed during the specification process, so defining it once in
//! this crate made it easy to replace.
//!
//! Now this crate serves primarily as a wrapper over two SHA256 crates: `sha2` and `ring`
//! which it switches between at runtime based on the availability of SHA intrinsics.
#[cfg(not(target_arch = "wasm32"))]
pub use ring::digest::{digest, Context, Digest, SHA256};
#[cfg(target_arch = "wasm32")]
use sha2::{Digest, Sha256};
pub use self::DynamicContext as Context;
use sha2::Digest;
#[cfg(feature = "zero_hash_cache")]
use lazy_static::lazy_static;
/// Returns the digest of `input`.
///
/// Uses `ring::digest::SHA256`.
/// Length of a SHA256 hash in bytes.
pub const HASH_LEN: usize = 32;
/// Returns the digest of `input` using the best available implementation.
pub fn hash(input: &[u8]) -> Vec<u8> {
#[cfg(not(target_arch = "wasm32"))]
let h = digest(&SHA256, input).as_ref().into();
DynamicImpl::best().hash(input)
}
#[cfg(target_arch = "wasm32")]
let h = Sha256::digest(input).as_ref().into();
h
/// Hash function returning a fixed-size array (to save on allocations).
///
/// Uses the best available implementation based on CPU features.
pub fn hash_fixed(input: &[u8]) -> [u8; HASH_LEN] {
DynamicImpl::best().hash_fixed(input)
}
/// Compute the hash of two slices concatenated.
///
/// # Panics
///
/// Will panic if either `h1` or `h2` are not 32 bytes in length.
#[cfg(not(target_arch = "wasm32"))]
pub fn hash32_concat(h1: &[u8], h2: &[u8]) -> [u8; 32] {
let mut context = Context::new(&SHA256);
context.update(h1);
context.update(h2);
let mut output = [0; 32];
output[..].copy_from_slice(context.finish().as_ref());
output
let mut ctxt = DynamicContext::new();
ctxt.update(h1);
ctxt.update(h2);
ctxt.finalize()
}
/// Compute the hash of two slices concatenated.
///
/// # Panics
///
/// Will panic if either `h1` or `h2` are not 32 bytes in length.
#[cfg(target_arch = "wasm32")]
pub fn hash32_concat(h1: &[u8], h2: &[u8]) -> [u8; 32] {
let mut preimage = [0; 64];
preimage[0..32].copy_from_slice(h1);
preimage[32..64].copy_from_slice(h2);
/// Context trait for abstracting over implementation contexts.
pub trait Sha256Context {
fn new() -> Self;
let mut output = [0; 32];
output[..].copy_from_slice(&hash(&preimage));
output
fn update(&mut self, bytes: &[u8]);
fn finalize(self) -> [u8; HASH_LEN];
}
/// Top-level trait implemented by both `sha2` and `ring` implementations.
pub trait Sha256 {
type Context: Sha256Context;
fn hash(&self, input: &[u8]) -> Vec<u8>;
fn hash_fixed(&self, input: &[u8]) -> [u8; HASH_LEN];
}
/// Implementation of SHA256 using the `sha2` crate (fastest on CPUs with SHA extensions).
struct Sha2CrateImpl;
impl Sha256Context for sha2::Sha256 {
fn new() -> Self {
sha2::Digest::new()
}
fn update(&mut self, bytes: &[u8]) {
sha2::Digest::update(self, bytes)
}
fn finalize(self) -> [u8; HASH_LEN] {
sha2::Digest::finalize(self).into()
}
}
impl Sha256 for Sha2CrateImpl {
type Context = sha2::Sha256;
fn hash(&self, input: &[u8]) -> Vec<u8> {
Self::Context::digest(input).into_iter().collect()
}
fn hash_fixed(&self, input: &[u8]) -> [u8; HASH_LEN] {
Self::Context::digest(input).into()
}
}
/// Implementation of SHA256 using the `ring` crate (fastest on CPUs without SHA extensions).
pub struct RingImpl;
impl Sha256Context for ring::digest::Context {
fn new() -> Self {
Self::new(&ring::digest::SHA256)
}
fn update(&mut self, bytes: &[u8]) {
self.update(bytes)
}
fn finalize(self) -> [u8; HASH_LEN] {
let mut output = [0; HASH_LEN];
output.copy_from_slice(self.finish().as_ref());
output
}
}
impl Sha256 for RingImpl {
type Context = ring::digest::Context;
fn hash(&self, input: &[u8]) -> Vec<u8> {
ring::digest::digest(&ring::digest::SHA256, input)
.as_ref()
.into()
}
fn hash_fixed(&self, input: &[u8]) -> [u8; HASH_LEN] {
let mut ctxt = Self::Context::new(&ring::digest::SHA256);
ctxt.update(input);
ctxt.finalize()
}
}
/// Default dynamic implementation that switches between available implementations.
pub enum DynamicImpl {
Sha2,
Ring,
}
// Runtime latch for detecting the availability of SHA extensions on x86_64.
//
// Inspired by the runtime switch within the `sha2` crate itself.
#[cfg(target_arch = "x86_64")]
cpufeatures::new!(x86_sha_extensions, "sha", "sse2", "ssse3", "sse4.1");
#[inline(always)]
pub fn have_sha_extensions() -> bool {
#[cfg(target_arch = "x86_64")]
return x86_sha_extensions::get();
#[cfg(not(target_arch = "x86_64"))]
return false;
}
impl DynamicImpl {
/// Choose the best available implementation based on the currently executing CPU.
#[inline(always)]
pub fn best() -> Self {
if have_sha_extensions() {
Self::Sha2
} else {
Self::Ring
}
}
}
impl Sha256 for DynamicImpl {
type Context = DynamicContext;
#[inline(always)]
fn hash(&self, input: &[u8]) -> Vec<u8> {
match self {
Self::Sha2 => Sha2CrateImpl.hash(input),
Self::Ring => RingImpl.hash(input),
}
}
#[inline(always)]
fn hash_fixed(&self, input: &[u8]) -> [u8; HASH_LEN] {
match self {
Self::Sha2 => Sha2CrateImpl.hash_fixed(input),
Self::Ring => RingImpl.hash_fixed(input),
}
}
}
/// Context encapsulating all implemenation contexts.
///
/// This enum ends up being 8 bytes larger than the largest inner context.
pub enum DynamicContext {
Sha2(sha2::Sha256),
Ring(ring::digest::Context),
}
impl Sha256Context for DynamicContext {
fn new() -> Self {
match DynamicImpl::best() {
DynamicImpl::Sha2 => Self::Sha2(Sha256Context::new()),
DynamicImpl::Ring => Self::Ring(Sha256Context::new()),
}
}
fn update(&mut self, bytes: &[u8]) {
match self {
Self::Sha2(ctxt) => Sha256Context::update(ctxt, bytes),
Self::Ring(ctxt) => Sha256Context::update(ctxt, bytes),
}
}
fn finalize(self) -> [u8; HASH_LEN] {
match self {
Self::Sha2(ctxt) => Sha256Context::finalize(ctxt),
Self::Ring(ctxt) => Sha256Context::finalize(ctxt),
}
}
}
/// The max index that can be used with `ZERO_HASHES`.