polyval/backend/
clmul.rs

1//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
2//! (i.e. Intel Sandy Bridge-compatible or newer)
3
4use crate::{Block, Key};
5use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
6
7#[cfg(target_arch = "x86")]
8use core::arch::x86::*;
9#[cfg(target_arch = "x86_64")]
10use core::arch::x86_64::*;
11
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
///
/// The state is two 128-bit SIMD values: the key and a running accumulator.
#[derive(Clone)]
pub struct Polyval {
    /// The `H` field element this instance was keyed with; written once by `new`.
    h: __m128i,

    /// Running accumulator: zero after `new`/`reset`, rewritten on every
    /// absorbed block, and returned by `finalize`.
    y: __m128i,
}
18
19impl NewUniversalHash for Polyval {
20    type KeySize = U16;
21
22    /// Initialize POLYVAL with the given `H` field element
23    fn new(h: &Key) -> Self {
24        unsafe {
25            // `_mm_loadu_si128` performs an unaligned load
26            #[allow(clippy::cast_ptr_alignment)]
27            Self {
28                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
29                y: _mm_setzero_si128(),
30            }
31        }
32    }
33}
34
35impl UniversalHash for Polyval {
36    type BlockSize = U16;
37
38    #[inline]
39    fn update(&mut self, x: &Block) {
40        unsafe {
41            self.mul(x);
42        }
43    }
44
45    /// Reset internal state
46    fn reset(&mut self) {
47        unsafe {
48            self.y = _mm_setzero_si128();
49        }
50    }
51
52    /// Get GHASH output
53    fn finalize(self) -> Output<Self> {
54        unsafe { core::mem::transmute(self.y) }
55    }
56}
57
58impl Polyval {
59    #[inline]
60    #[target_feature(enable = "pclmulqdq")]
61    #[target_feature(enable = "sse4.1")]
62    unsafe fn mul(&mut self, x: &Block) {
63        let h = self.h;
64
65        // `_mm_loadu_si128` performs an unaligned load
66        #[allow(clippy::cast_ptr_alignment)]
67        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
68        let y = _mm_xor_si128(self.y, x);
69
70        let h0 = h;
71        let h1 = _mm_shuffle_epi32(h, 0x0E);
72        let h2 = _mm_xor_si128(h0, h1);
73        let y0 = y;
74
75        // Multiply values partitioned to 64-bit parts
76        let y1 = _mm_shuffle_epi32(y, 0x0E);
77        let y2 = _mm_xor_si128(y0, y1);
78        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
79        let t1 = _mm_clmulepi64_si128(y, h, 0x11);
80        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
81        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
82        let v0 = t0;
83        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
84        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
85        let v3 = _mm_shuffle_epi32(t1, 0x0E);
86
87        // Polynomial reduction
88        let v2 = xor5(
89            v2,
90            v0,
91            _mm_srli_epi64(v0, 1),
92            _mm_srli_epi64(v0, 2),
93            _mm_srli_epi64(v0, 7),
94        );
95
96        let v1 = xor4(
97            v1,
98            _mm_slli_epi64(v0, 63),
99            _mm_slli_epi64(v0, 62),
100            _mm_slli_epi64(v0, 57),
101        );
102
103        let v3 = xor5(
104            v3,
105            v1,
106            _mm_srli_epi64(v1, 1),
107            _mm_srli_epi64(v1, 2),
108            _mm_srli_epi64(v1, 7),
109        );
110
111        let v2 = xor4(
112            v2,
113            _mm_slli_epi64(v1, 63),
114            _mm_slli_epi64(v1, 62),
115            _mm_slli_epi64(v1, 57),
116        );
117
118        self.y = _mm_unpacklo_epi64(v2, v3);
119    }
120}
121
#[cfg(feature = "zeroize")]
impl Drop for Polyval {
    /// Wipe both the key and the accumulator when the value goes out of scope.
    fn drop(&mut self) {
        zeroize::Zeroize::zeroize(&mut self.h);
        zeroize::Zeroize::zeroize(&mut self.y);
    }
}
130
/// XOR four 128-bit vectors together.
///
/// The operands are combined pairwise, so the two inner XORs do not depend
/// on each other.
#[inline(always)]
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
    let first_pair = _mm_xor_si128(e1, e2);
    let second_pair = _mm_xor_si128(e3, e4);
    _mm_xor_si128(first_pair, second_pair)
}
135
/// XOR five 128-bit vectors together.
///
/// `e2..e5` are combined pairwise first and the result is then XORed into `e1`.
#[inline(always)]
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
    let first_pair = _mm_xor_si128(e2, e3);
    let second_pair = _mm_xor_si128(e4, e5);
    let tail = _mm_xor_si128(first_pair, second_pair);
    _mm_xor_si128(e1, tail)
}