diff --git a/curve25519-dalek-derive/tests/tests.rs b/curve25519-dalek-derive/tests/tests.rs index 204ce158d..d5eacc8df 100644 --- a/curve25519-dalek-derive/tests/tests.rs +++ b/curve25519-dalek-derive/tests/tests.rs @@ -104,18 +104,35 @@ mod inner_spec { #[for_target_feature("avx2")] const IS_AVX2: bool = true; + #[for_target_feature("sse2")] #[test] fn test_specialized() { assert!(!IS_AVX2); } + #[for_target_feature("avx2")] + #[test] + fn test_specialized_avx2() { + assert!(IS_AVX2); + } + #[cfg(test)] + #[for_target_feature("sse2")] mod tests { #[test] fn test_specialized_inner() { assert!(!super::IS_AVX2); } } + + #[cfg(test)] + #[for_target_feature("avx2")] + mod tests_avx2 { + #[test] + fn test_specialized_inner_avx2() { + assert!(super::IS_AVX2); + } + } } #[unsafe_target_feature("sse2")] @@ -127,9 +144,17 @@ fn test_sse2_only() {} // pretty esoteric feature. Looking at the table of supported avx512 features at // https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512 it seems avx512vp2intersect is one of the // most unusual ones that has rustc knows about +#[cfg(target_feature = "avx512vp2intersect")] #[unsafe_target_feature("avx512vp2intersect")] #[test] fn test_unset_target_feature() { + assert!(std::arch::is_x86_feature_detected!("avx512vp2intersect")); +} + +#[cfg(not(target_feature = "avx512vp2intersect"))] +#[unsafe_target_feature("avx512vp2intersect")] +#[test] +fn test_unset_target_feature_removed() { compile_error!("When an unknown target_feature is set on a test, unsafe_target_feature is expected remove the function"); } diff --git a/curve25519-dalek/benches/dalek_benchmarks.rs b/curve25519-dalek/benches/dalek_benchmarks.rs index b8bdd6772..48488128d 100644 --- a/curve25519-dalek/benches/dalek_benchmarks.rs +++ b/curve25519-dalek/benches/dalek_benchmarks.rs @@ -74,6 +74,38 @@ mod edwards_benches { }); } + fn vartime_triple_base_scalar_mul_128(c: &mut BenchmarkGroup) { + c.bench_function("Variable-time a1*A1+a2*A2+b*B (128-bit a1,a2)", |bench| { + let mut rng = rng(); + let A1 = EdwardsPoint::mul_base(&Scalar::random(&mut rng)); + let A2 = EdwardsPoint::mul_base(&Scalar::random(&mut rng)); + + bench.iter_batched( + || { + // Generate 128-bit scalars for a1 and a2 + let mut a1_bytes = [0u8; 32]; + let mut a2_bytes = [0u8; 32]; + + // Fill lower 16 bytes with random data, upper 16 bytes are zero + for i in 0..16 { + a1_bytes[i] = rng.next_u32() as u8; + a2_bytes[i] = rng.next_u32() as u8; + } + + let a1 = Scalar::from_bytes_mod_order(a1_bytes); + let a2 = Scalar::from_bytes_mod_order(a2_bytes); + let b = Scalar::random(&mut rng); + + (a1, a2, b) + }, + |(a1, a2, b)| { + EdwardsPoint::vartime_triple_scalar_mul_basepoint(&a1, &A1, &a2, &A2, &b) + }, + BatchSize::SmallInput, + ); + }); + } + #[cfg(feature = "digest")] fn encode_to_curve(c: &mut BenchmarkGroup) { let mut rng = rng(); @@ -115,6 +147,7 @@ mod edwards_benches { consttime_fixed_base_scalar_mul(&mut g); consttime_variable_base_scalar_mul(&mut g); vartime_double_base_scalar_mul(&mut g); + vartime_triple_base_scalar_mul_128(&mut g); encode_to_curve(&mut g); hash_to_curve(&mut g); } diff --git a/curve25519-dalek/docs/parallel-formulas.md b/curve25519-dalek/docs/parallel-formulas.md index 70aadc38c..03d5e8387 100644 --- a/curve25519-dalek/docs/parallel-formulas.md +++ b/curve25519-dalek/docs/parallel-formulas.md @@ -219,7 +219,7 @@ element vectors, whose optimum choice is determined by the details of the instruction set. However, it's not possible to perfectly separate the implementation of the field element vectors from the implementation of the point operations. Instead, the [`avx2`] and -[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and +`ifma` backends provide `ExtendedPoint` and `CachedPoint` types, and the [`scalar_mul`] code uses one of the backend types by a type alias. # Comparison to non-vectorized formulas diff --git a/curve25519-dalek/src/backend.rs b/curve25519-dalek/src/backend.rs index cfb8b003f..7194181d5 100644 --- a/curve25519-dalek/src/backend.rs +++ b/curve25519-dalek/src/backend.rs @@ -275,3 +275,31 @@ pub fn vartime_double_base_mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> Edwa BackendKind::Serial => serial::scalar_mul::vartime_double_base::mul(a, A, b), } } + +/// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. +/// +/// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are less than \\(2^{128}\\). +#[allow(non_snake_case)] +pub fn vartime_triple_base_mul_128_128_256( + a1: &Scalar, + A1: &EdwardsPoint, + a2: &Scalar, + A2: &EdwardsPoint, + b: &Scalar, +) -> EdwardsPoint { + match get_selected_backend() { + #[cfg(curve25519_dalek_backend = "simd")] + BackendKind::Avx2 => { + vector::scalar_mul::vartime_triple_base::spec_avx2::mul_128_128_256(a1, A1, a2, A2, b) + } + #[cfg(all(curve25519_dalek_backend = "unstable_avx512", nightly))] + BackendKind::Avx512 => { + vector::scalar_mul::vartime_triple_base::spec_avx512ifma_avx512vl::mul_128_128_256( + a1, A1, a2, A2, b, + ) + } + BackendKind::Serial => { + serial::scalar_mul::vartime_triple_base::mul_128_128_256(a1, A1, a2, A2, b) + } + } +} diff --git a/curve25519-dalek/src/backend/serial/scalar_mul.rs b/curve25519-dalek/src/backend/serial/scalar_mul.rs index 7747decc3..9d9ba3376 100644 --- a/curve25519-dalek/src/backend/serial/scalar_mul.rs +++ b/curve25519-dalek/src/backend/serial/scalar_mul.rs @@ -23,6 +23,9 @@ pub mod variable_base; #[allow(missing_docs)] pub mod vartime_double_base; +#[allow(missing_docs)] +pub mod vartime_triple_base; + #[cfg(feature = "alloc")] pub mod straus; diff --git a/curve25519-dalek/src/backend/serial/scalar_mul/vartime_triple_base.rs b/curve25519-dalek/src/backend/serial/scalar_mul/vartime_triple_base.rs new file mode 100644 index 000000000..72c681d68 --- /dev/null +++ b/curve25519-dalek/src/backend/serial/scalar_mul/vartime_triple_base.rs @@ -0,0 +1,326 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// Copyright (c) 2016-2021 isis lovecruft +// Copyright (c) 2016-2019 Henry de Valence +// See LICENSE for licensing information. + +#![allow(non_snake_case)] + +use core::cmp::Ordering; + +use crate::backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint}; +use crate::constants; +use crate::edwards::EdwardsPoint; +use crate::scalar::{HEEA_MAX_INDEX, Scalar}; +use crate::traits::Identity; +use crate::window::NafLookupTable5; + +/// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. +/// +/// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are known to be less than +/// \\(2^{128}\\), while \\(b\\) is a full 256-bit scalar. +/// +/// # Optimization Strategy +/// +/// The function decomposes the 256-bit scalar \\(b\\) as \\(b = b_{lo} + b_{hi} \cdot 2^{128}\\), +/// where both \\(b_{lo}\\) and \\(b_{hi}\\) are 128-bit values. This allows computing: +/// +/// \\[ +/// a_1 A_1 + a_2 A_2 + b_{lo} B + b_{hi} B' +/// \\] +/// +/// where \\(B' = B \cdot 2^{128}\\) is a precomputed constant. Now all four scalars +/// (\\(a_1, a_2, b_{lo}, b_{hi}\\)) are 128 bits, and two of the bases (\\(B\\) and \\(B'\\)) +/// use precomputed tables. +/// +/// # Implementation +/// +/// - For \\(A_1\\) and \\(A_2\\): NAF with window width 5 (8 precomputed points each) +/// - For \\(B\\): NAF with window width 8 when precomputed tables available (64 points) +/// - For \\(B'\\): NAF with window width 5 (could be optimized with precomputed table) +/// +/// The algorithm shares doublings across all four scalar multiplications, processing +/// only 128 bits instead of 256, providing approximately 2x speedup over the naive approach. +pub fn mul_128_128_256( + a1: &Scalar, + A1: &EdwardsPoint, + a2: &Scalar, + A2: &EdwardsPoint, + b: &Scalar, +) -> EdwardsPoint { + // assert that a1 and a2 are less than 2^128 + debug_assert!(a1.as_bytes()[16..32].iter().all(|&b| b == 0)); + debug_assert!(a2.as_bytes()[16..32].iter().all(|&b| b == 0)); + + // Decompose b into b_lo (lower 128 bits) and b_hi (upper 128 bits) + // b = b_lo + b_hi * 2^128 + let b_bytes = b.as_bytes(); + + let mut b_lo_bytes = [0u8; 32]; + let mut b_hi_bytes = [0u8; 32]; + + // Copy lower 16 bytes to b_lo, upper 16 bytes to b_hi + b_lo_bytes[..16].copy_from_slice(&b_bytes[..16]); + b_hi_bytes[..16].copy_from_slice(&b_bytes[16..]); + + let b_lo = Scalar::from_canonical_bytes(b_lo_bytes).unwrap(); + let b_hi = Scalar::from_canonical_bytes(b_hi_bytes).unwrap(); + + // Compute NAF representations (all scalars are now ~128 bits) + let a1_naf = a1.non_adjacent_form(5); + let a2_naf = a2.non_adjacent_form(5); + let b_lo_naf = b_lo.non_adjacent_form(5); + let b_hi_naf = b_hi.non_adjacent_form(5); + + // Find starting index - check all NAFs up to bit 127 + // (with potential carry to bit 128 or 129) + let mut i: usize = HEEA_MAX_INDEX; + for j in (0..=HEEA_MAX_INDEX).rev() { + i = j; + if a1_naf[i] != 0 || a2_naf[i] != 0 || b_lo_naf[i] != 0 || b_hi_naf[i] != 0 { + break; + } + } + + // Create lookup tables + let table_A1 = NafLookupTable5::::from(A1); + let table_A2 = NafLookupTable5::::from(A2); + + #[cfg(feature = "precomputed-tables")] + let table_B = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT; + #[cfg(not(feature = "precomputed-tables"))] + let table_B = + &NafLookupTable5::::from(&constants::ED25519_BASEPOINT_POINT); + + // B' = B * 2^128 (precomputed constant point) + #[cfg(feature = "precomputed-tables")] + let table_B_128 = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT_128; + #[cfg(not(feature = "precomputed-tables"))] + let table_B_128 = + &NafLookupTable5::::from(&constants::ED25519_BASEPOINT_128_POINT); + + let mut r = ProjectivePoint::identity(); + + loop { + let mut t = r.double(); + + // Add contributions from a1*A1 + match a1_naf[i].cmp(&0) { + Ordering::Greater => t = &t.as_extended() + &table_A1.select(a1_naf[i] as usize), + Ordering::Less => t = &t.as_extended() - &table_A1.select(-a1_naf[i] as usize), + Ordering::Equal => {} + } + + // Add contributions from a2*A2 + match a2_naf[i].cmp(&0) { + Ordering::Greater => t = &t.as_extended() + &table_A2.select(a2_naf[i] as usize), + Ordering::Less => t = &t.as_extended() - &table_A2.select(-a2_naf[i] as usize), + Ordering::Equal => {} + } + + // Add contributions from b_lo*B + match b_lo_naf[i].cmp(&0) { + Ordering::Greater => t = &t.as_extended() + &table_B.select(b_lo_naf[i] as usize), + Ordering::Less => t = &t.as_extended() - &table_B.select(-b_lo_naf[i] as usize), + Ordering::Equal => {} + } + + // Add contributions from b_hi*B' where B' = B * 2^128 + match b_hi_naf[i].cmp(&0) { + Ordering::Greater => t = &t.as_extended() + &table_B_128.select(b_hi_naf[i] as usize), + Ordering::Less => t = &t.as_extended() - &table_B_128.select(-b_hi_naf[i] as usize), + Ordering::Equal => {} + } + + r = t.as_projective(); + + if i == 0 { + break; + } + i -= 1; + } + + r.as_extended() +} + +#[cfg(test)] +mod test { + + use rand::{RngCore, rng}; + + use super::*; + use crate::scalar::Scalar; + + fn random_scalar() -> Scalar { + let mut wide = [0u8; 64]; + let mut rng = rng(); + rng.fill_bytes(&mut wide); + Scalar::from_bytes_mod_order_wide(&wide) + } + + #[test] + fn test_triple_base_multiplication() { + // Test vectors with random scalars + let a1 = Scalar::from(12345u64); + let a2 = Scalar::from(67890u64); + let b = random_scalar(); + + // Random points (using scalar multiplication of basepoint) + let A1 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(2u64); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(3u64); + + // Compute using the optimized triple-base function + let result = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + + // Compute using naive addition + let expected = &(&(&a1 * &A1) + &(&a2 * &A2)) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result, expected); + } + + #[test] + fn test_triple_base_multiplication_128() { + // Test with 128-bit scalars for a1 and a2 + // Create a 128-bit scalar (use lower half) + let a1_bytes = [ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + ]; + let a1 = Scalar::from_bytes_mod_order(a1_bytes); + + let a2_bytes = [ + 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x77, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + ]; + let a2 = Scalar::from_bytes_mod_order(a2_bytes); + + // Full 256-bit scalar for b + let b = random_scalar(); + + // Test points + let A1 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(5u64); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(7u64); + + // Test the optimized 128-bit version + let result_128 = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + + // Compute expected result + let expected = &(&(&a1 * &A1) + &(&a2 * &A2)) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result_128, expected, "Optimized 128-bit version failed"); + } + + #[test] + fn test_triple_base_with_zero_scalars() { + let a1 = Scalar::ZERO; + let a2 = Scalar::from(123u64); + let b = random_scalar(); + + let A1 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(2u64); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(3u64); + + let result = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + let expected = &(&a2 * &A2) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result, expected); + } + + #[test] + fn test_triple_base_with_identity_points() { + let a1 = Scalar::from(111u64); + let a2 = Scalar::from(222u64); + let b = random_scalar(); + + let A1 = EdwardsPoint::identity(); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(3u64); + + let result = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + let expected = &(&a2 * &A2) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result, expected); + } + + #[test] + fn test_triple_base_consistency() { + // Test that both functions give the same result for 128-bit inputs + let a1 = Scalar::from(0x123456789ABCDEFu64); + let a2 = Scalar::from(0xFEDCBA987654321u64); + let b = random_scalar(); + + let A1 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(11u64); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(13u64); + + let result_optimized = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + let result_general = + &(&(&a1 * &A1) + &(&a2 * &A2)) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result_optimized, result_general); + } + + #[test] + fn test_triple_base_large_scalars() { + // Test with large scalars + let mut a1_bytes = [0xFFu8; 32]; + for i in 16..32 { + a1_bytes[i] = 0x00; + } + let a1 = Scalar::from_bytes_mod_order(a1_bytes); + + let mut a2_bytes = [0xAAu8; 32]; + for i in 16..32 { + a2_bytes[i] = 0x00; + } + let a2 = Scalar::from_bytes_mod_order(a2_bytes); + + let b = random_scalar(); + + let A1 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(17u64); + let A2 = &constants::ED25519_BASEPOINT_POINT * &Scalar::from(19u64); + + let result = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + let expected = &(&(&a1 * &A1) + &(&a2 * &A2)) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + assert_eq!(result, expected); + } + + // Proptest for vartime_triple_scalar_mul_basepoint equivalence + proptest::proptest! { + #[test] + fn proptest_triple_scalar_mul_equivalence( + a1_bytes_16 in proptest::array::uniform16(proptest::num::u8::ANY), + a2_bytes_16 in proptest::array::uniform16(proptest::num::u8::ANY), + b_bytes in proptest::array::uniform32(proptest::num::u8::ANY), + A1_scalar_bytes in proptest::array::uniform32(proptest::num::u8::ANY), + A2_scalar_bytes in proptest::array::uniform32(proptest::num::u8::ANY), + ) { + // Construct 128-bit scalars a1 and a2 (upper 16 bytes are zero) + let mut a1_bytes = [0u8; 32]; + let mut a2_bytes = [0u8; 32]; + a1_bytes[..16].copy_from_slice(&a1_bytes_16); + a2_bytes[..16].copy_from_slice(&a2_bytes_16); + + let a1 = Scalar::from_bytes_mod_order(a1_bytes); + let a2 = Scalar::from_bytes_mod_order(a2_bytes); + + // Construct full 256-bit scalar b + let b = Scalar::from_bytes_mod_order(b_bytes); + + // Generate random points A1 and A2 using scalar multiplication of basepoint + let A1_scalar = Scalar::from_bytes_mod_order(A1_scalar_bytes); + let A2_scalar = Scalar::from_bytes_mod_order(A2_scalar_bytes); + let A1 = &constants::ED25519_BASEPOINT_POINT * &A1_scalar; + let A2 = &constants::ED25519_BASEPOINT_POINT * &A2_scalar; + + // Compute using the optimized triple-base function + let result_optimized = mul_128_128_256(&a1, &A1, &a2, &A2, &b); + + // Compute using raw operations: a1*A1 + a2*A2 + b*B + let expected = &(&(&a1 * &A1) + &(&a2 * &A2)) + &(&b * &constants::ED25519_BASEPOINT_POINT); + + proptest::prop_assert_eq!(result_optimized, expected, "Optimized triple scalar mul should equal raw operations"); + } + } +} diff --git a/curve25519-dalek/src/backend/serial/u32/constants.rs b/curve25519-dalek/src/backend/serial/u32/constants.rs index ff6e4ec37..c7e8f810e 100644 --- a/curve25519-dalek/src/backend/serial/u32/constants.rs +++ b/curve25519-dalek/src/backend/serial/u32/constants.rs @@ -19,9 +19,9 @@ use crate::edwards::EdwardsPoint; #[cfg(feature = "precomputed-tables")] use crate::{ - backend::serial::curve_models::AffineNielsPoint, + backend::serial::curve_models::{AffineNielsPoint, ProjectiveNielsPoint}, edwards::EdwardsBasepointTable, - window::{LookupTable, NafLookupTable8}, + window::{LookupTable, NafLookupTable5, NafLookupTable8}, }; /// The value of minus one, equal to `-&FieldElement::ONE` @@ -136,6 +136,48 @@ pub const ED25519_BASEPOINT_POINT: EdwardsPoint = EdwardsPoint { ]), }; +/// The Ed25519 basepoint, mul by 2^128, as an `EdwardsPoint`. +pub const ED25519_BASEPOINT_128_POINT: EdwardsPoint = EdwardsPoint { + X: FieldElement2625::from_limbs([ + 2664042, 23449881, 8588504, 31570262, 52025907, 14016958, 17934911, 10536770, 36081707, + 18715816, + ]), + Y: FieldElement2625::from_limbs([ + 53612635, 17322216, 64979144, 12220533, 27384794, 7796776, 63981171, 31808137, 3318544, + 10876052, + ]), + Z: FieldElement2625::from_limbs([ + 38318927, 6633020, 30360108, 27133620, 43190211, 599215, 50990868, 21586734, 34463843, + 14390137, + ]), + T: FieldElement2625::from_limbs([ + 46012201, 27645749, 48994527, 27092089, 44549182, 4023192, 8388284, 20428666, 53367776, + 2097936, + ]), +}; + +#[cfg(all(test, feature = "precomputed-tables"))] +mod tests { + use super::*; + use crate::window::NafLookupTable5; + + #[test] + fn basepoint_128_table_matches_generated() { + let generated = NafLookupTable5::::from(&ED25519_BASEPOINT_128_POINT); + + for (expected, actual) in AFFINE_ODD_MULTIPLES_OF_BASEPOINT_128 + .0 + .iter() + .zip(generated.0.iter()) + { + assert_eq!(expected.Y_plus_X, actual.Y_plus_X); + assert_eq!(expected.Y_minus_X, actual.Y_minus_X); + assert_eq!(expected.Z, actual.Z); + assert_eq!(expected.T2d, actual.T2d); + } + } +} + /// The 8-torsion subgroup \\(\mathcal E \[8\]\\). /// /// In the case of Curve25519, it is cyclic; the \\(i\\)-th element of @@ -4808,3 +4850,155 @@ pub(crate) const AFFINE_ODD_MULTIPLES_OF_BASEPOINT: NafLookupTable8 = + NafLookupTable5([ + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 56276677, 40772097, 73567648, 43790795, 79410701, 21813734, 81916082, 42344907, + 39400251, 29591868, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 50948574, 27426767, 56390639, 14204703, 42467750, 27334249, 46046259, 21271367, + 34345701, 25714667, + ]), + Z: FieldElement2625::from_limbs([ + 38318927, 6633020, 30360108, 27133620, 43190211, 599215, 50990868, 21586734, + 34463843, 14390137, + ]), + T2d: FieldElement2625::from_limbs([ + 55001814, 11526031, 29172528, 19702978, 57194240, 33496133, 28505201, 6534991, + 35091584, 29849501, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 61573526, 60394104, 66658949, 41415014, 72576091, 30187734, 76592377, 63167873, + 101619296, 30513334, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 51181788, 696007, 24915963, 12735707, 2911894, 7060820, 64624395, 1392014, 2117242, + 549672, + ]), + Z: FieldElement2625::from_limbs([ + 1375370, 21089372, 48301986, 8317127, 40530149, 15040395, 40784256, 3020215, + 2680506, 21416503, + ]), + T2d: FieldElement2625::from_limbs([ + 20572821, 11904555, 50242953, 31126781, 63713506, 29202792, 58335501, 11712060, + 32040945, 28287357, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 63997925, 22141397, 83453199, 52824554, 78447079, 49717694, 122013047, 24982068, + 56606655, 58476714, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 25892846, 11528509, 46114731, 26269695, 31949658, 18508240, 8742696, 14557236, + 16402355, 26155200, + ]), + Z: FieldElement2625::from_limbs([ + 38457063, 5424669, 61803693, 25961913, 42758992, 27223402, 64723485, 24042834, + 60529696, 25431727, + ]), + T2d: FieldElement2625::from_limbs([ + 16905521, 9132640, 31799824, 27591686, 39867108, 6595207, 56652308, 20125311, + 25159658, 2078150, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 85244803, 20390705, 67880180, 3834008, 62046955, 32059486, 53528634, 61952242, + 53903557, 36237664, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 51282070, 16196724, 13662050, 32134248, 30369654, 19444710, 35256476, 33331300, + 59297746, 14250525, + ]), + Z: FieldElement2625::from_limbs([ + 3445791, 21391121, 34232242, 31943518, 6858093, 4243898, 19125331, 22740977, + 66125843, 17090801, + ]), + T2d: FieldElement2625::from_limbs([ + 33365109, 11130954, 12461906, 19758375, 47688325, 30792256, 40299724, 5246188, + 44473969, 15349344, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 110937413, 48311921, 56664014, 24665671, 117342840, 14590869, 65779056, 33374083, + 47093275, 38469648, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 11795097, 23045357, 37986619, 25517870, 61752555, 20274894, 5272019, 20059223, + 35147576, 20606386, + ]), + Z: FieldElement2625::from_limbs([ + 57507727, 19391019, 38433125, 11242927, 27002606, 28894320, 56608229, 12332442, + 8089695, 13073249, + ]), + T2d: FieldElement2625::from_limbs([ + 37741271, 17839496, 20586535, 9831905, 18983461, 25508586, 63411317, 8323540, + 38457441, 1489950, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 22997800, 33050677, 60441767, 41527662, 30621381, 4134210, 108906708, 35532623, + 58504533, 33872302, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 6811554, 1638711, 35767789, 14166397, 19866339, 260838, 19580826, 7806685, + 21147024, 11184764, + ]), + Z: FieldElement2625::from_limbs([ + 56686189, 817943, 57332738, 26228354, 5008852, 4939604, 43684796, 24536787, + 9480743, 15490942, + ]), + T2d: FieldElement2625::from_limbs([ + 64866667, 25959608, 1287178, 28622686, 28996196, 27179453, 29156417, 122723, + 319481, 12947309, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 88952292, 52909474, 73631435, 34809825, 64421577, 23324883, 98191597, 46736505, + 56269697, 27274610, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 36626131, 26445435, 43443322, 31269185, 4788786, 21966751, 10657839, 11622879, + 5076135, 12024398, + ]), + Z: FieldElement2625::from_limbs([ + 27574083, 30265038, 50965871, 383090, 29024858, 20875059, 21323131, 11150207, + 30687572, 18127668, + ]), + T2d: FieldElement2625::from_limbs([ + 11705253, 28151182, 52953365, 25485653, 22768936, 30893323, 62228850, 15557152, + 13370370, 11519727, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement2625::from_limbs([ + 47704924, 31694873, 47305006, 31556775, 111862751, 53310043, 92993662, 39813534, + 67486461, 60545321, + ]), + Y_minus_X: FieldElement2625::from_limbs([ + 4781635, 20898487, 30324746, 31566849, 66314586, 2020338, 46386772, 13303771, + 3604654, 31609762, + ]), + Z: FieldElement2625::from_limbs([ + 62029700, 23518227, 27988369, 32135885, 45236203, 32920151, 9386636, 4293881, + 41034217, 10875575, + ]), + T2d: FieldElement2625::from_limbs([ + 55189462, 23119455, 46083877, 20752941, 52953876, 30458463, 26312358, 17747216, + 58369047, 6595265, + ]), + }, + ]); diff --git a/curve25519-dalek/src/backend/serial/u64/constants.rs b/curve25519-dalek/src/backend/serial/u64/constants.rs index cd27b8ff2..99698ee79 100644 --- a/curve25519-dalek/src/backend/serial/u64/constants.rs +++ b/curve25519-dalek/src/backend/serial/u64/constants.rs @@ -17,9 +17,9 @@ use crate::edwards::EdwardsPoint; #[cfg(feature = "precomputed-tables")] use crate::{ - backend::serial::curve_models::AffineNielsPoint, + backend::serial::curve_models::{AffineNielsPoint, ProjectiveNielsPoint}, edwards::EdwardsBasepointTable, - window::{LookupTable, NafLookupTable8}, + window::{LookupTable, NafLookupTable5, NafLookupTable8}, }; /// The value of minus one, equal to `-&FieldElement::ONE` @@ -185,6 +185,38 @@ pub const ED25519_BASEPOINT_POINT: EdwardsPoint = EdwardsPoint { ]), }; +/// The Ed25519 basepoint, mul by 2^128, as an `EdwardsPoint`. +pub const ED25519_BASEPOINT_128_POINT: EdwardsPoint = EdwardsPoint { + X: FieldElement51::from_limbs([ + 1573694877509226, + 2118644427590872, + 940662180141619, + 707110682864191, + 1255997186674731, + ]), + Y: FieldElement51::from_limbs([ + 1162474291335259, + 820106152083656, + 523232807607258, + 2134608004007539, + 729879497843472, + ]), + Z: FieldElement51::from_limbs([ + 445134475408207, + 1820906444767788, + 40212681131971, + 1448661247201044, + 965705781338211, + ]), + T: FieldElement51::from_limbs([ + 1855274855831337, + 1818119365171423, + 269991889323070, + 1370944576683708, + 140790155072480, + ]), +}; + /// The 8-torsion subgroup \\(\mathcal E \[8\]\\). /// /// In the case of Curve25519, it is cyclic; the \\(i\\)-th element of @@ -7779,3 +7811,251 @@ pub(crate) const AFFINE_ODD_MULTIPLES_OF_BASEPOINT: NafLookupTable8 = + NafLookupTable5([ + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 2736169168844485, + 2938750579674528, + 1463894987748877, + 2841718686871730, + 1985876684518203, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 1840579227511262, + 953261538178031, + 1834370441150886, + 1427497321143347, + 1725682124853989, + ]), + Z: FieldElement51::from_limbs([ + 445134475408207, + 1820906444767788, + 40212681131971, + 1448661247201044, + 965705781338211, + ]), + T2d: FieldElement51::from_limbs([ + 773498901840598, + 1322244500169520, + 2247887491217152, + 438555850765425, + 2003166138168448, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 4052979773311382, + 2779314608743045, + 2025864608050267, + 4239124274918649, + 2047715283211872, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 46708290287836, + 854678853922811, + 473843612020374, + 93416542836491, + 36887865609850, + ]), + Z: FieldElement51::from_limbs([ + 1415283798768778, + 558152993015714, + 1009343863091429, + 202683238470016, + 1437237189863098, + ]), + T2d: FieldElement51::from_limbs([ + 798901183048341, + 2088882963129737, + 1959766260461794, + 785983100035341, + 1898332425873393, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 1485884064040933, + 3544995893699855, + 3336498043486695, + 1676518325863799, + 3924305903599551, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 773665168496622, + 1762929435191211, + 1242066992989018, + 976919579682600, + 1755245776095155, + ]), + Z: FieldElement51::from_limbs([ + 364043412623079, + 1742274550500525, + 1826931625194320, + 1613487341804061, + 1706694369057824, + ]), + T2d: FieldElement51::from_limbs([ + 612881112626481, + 1851646735104528, + 442596889481956, + 1350586815509012, + 139462310881258, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 1368397133953923, + 257295989327092, + 2151475747930859, + 4157544636401722, + 2431868518957253, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 1086943799443606, + 2156492892436322, + 1304912429279094, + 2236825713899676, + 956336603451346, + ]), + Z: FieldElement51::from_limbs([ + 1435533833442335, + 2143693239375794, + 284803180569965, + 1526121151845459, + 1146944306085907, + ]), + T2d: FieldElement51::from_limbs([ + 746985711541365, + 1325962113197906, + 2066433367845509, + 352065757310156, + 1030077083459185, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 3242158246905157, + 1655285217271758, + 979176760705656, + 2239696862950768, + 2581654422853147, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 1546547740539545, + 1712475305386299, + 1360625165812971, + 1346151673524691, + 1382871190753080, + ]), + Z: FieldElement51::from_limbs([ + 1301309314400143, + 754500097438053, + 1939065018255086, + 827616229574117, + 877330897268831, + ]), + T2d: FieldElement51::from_limbs([ + 1197188348633815, + 659807996092455, + 1711852247689765, + 558583377269877, + 99988890374241, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 2217993410898728, + 2786874281837735, + 277442167258821, + 2384554073376980, + 2273131766789461, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 109972040445858, + 950690845410797, + 17504561734371, + 523897781536666, + 750596827295120, + ]), + Z: FieldElement51::from_limbs([ + 54891282232941, + 1760155098862594, + 331491218058708, + 1646635945464764, + 1039579529390631, + ]), + T2d: FieldElement51::from_limbs([ + 1742119867631979, + 1920835943375882, + 1823982243967588, + 8235830273089, + 868879199166457, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 3550694783929828, + 2336047885420235, + 1565306465484489, + 3136433856071917, + 1830368149412737, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 1774723137461971, + 2098439526999162, + 1474163710169650, + 779998216757295, + 806943695140007, + ]), + Z: FieldElement51::from_limbs([ + 2031052346670915, + 25708785675631, + 1400901524447834, + 748277746457979, + 1216527237136724, + ]), + T2d: FieldElement51::from_limbs([ + 1889193855982501, + 1710313274081557, + 2073215834484008, + 1044022860024178, + 773075805930498, + ]), + }, + ProjectiveNielsPoint { + Y_plus_X: FieldElement51::from_limbs([ + 2127006969359196, + 2117739369058606, + 3577576537383903, + 2671841131559038, + 4063127780311805, + ]), + Y_minus_X: FieldElement51::from_limbs([ + 1402473726670403, + 2118415406774282, + 135582654390618, + 892801005112916, + 2121295222735022, + ]), + Z: FieldElement51::from_limbs([ + 1578281559293828, + 2156602763973009, + 2209233981554667, + 288157485447820, + 729847524631017, + ]), + T2d: FieldElement51::from_limbs([ + 1551520416538582, + 1392706341252901, + 2044032904069908, + 1190995531234982, + 442600800298007, + ]), + }, + ]); diff --git a/curve25519-dalek/src/backend/vector/scalar_mul.rs b/curve25519-dalek/src/backend/vector/scalar_mul.rs index fed3470e7..d544cb43d 100644 --- a/curve25519-dalek/src/backend/vector/scalar_mul.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul.rs @@ -17,6 +17,9 @@ pub mod variable_base; #[allow(missing_docs)] pub mod vartime_double_base; +#[allow(missing_docs)] +pub mod vartime_triple_base; + #[allow(missing_docs)] #[cfg(feature = "alloc")] pub mod straus; diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/vartime_triple_base.rs b/curve25519-dalek/src/backend/vector/scalar_mul/vartime_triple_base.rs new file mode 100644 index 000000000..dae44af2b --- /dev/null +++ b/curve25519-dalek/src/backend/vector/scalar_mul/vartime_triple_base.rs @@ -0,0 +1,188 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// Copyright (c) 2016-2021 isis lovecruft +// Copyright (c) 2016-2019 Henry de Valence +// See LICENSE for licensing information. + +#![allow(non_snake_case)] + +#[curve25519_dalek_derive::unsafe_target_feature_specialize( + "avx2", + conditional( + "avx512ifma,avx512vl", + all(curve25519_dalek_backend = "unstable_avx512", nightly) + ) +)] +pub mod spec { + + use core::cmp::Ordering; + + #[for_target_feature("avx2")] + use crate::backend::vector::avx2::{CachedPoint, ExtendedPoint}; + + #[for_target_feature("avx512ifma")] + use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + + #[cfg(feature = "precomputed-tables")] + #[for_target_feature("avx2")] + use crate::backend::vector::avx2::constants::BASEPOINT_ODD_LOOKUP_TABLE; + + #[for_target_feature("avx512ifma")] + use crate::backend::vector::ifma::constants::BASEPOINT_ODD_LOOKUP_TABLE; + + use crate::constants; + use crate::edwards::EdwardsPoint; + use crate::scalar::HEEA_MAX_INDEX; + use crate::scalar::Scalar; + #[allow(unused_imports)] + use crate::traits::Identity; + use crate::window::NafLookupTable5; + + /// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. + /// + /// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are known to be less than + /// \\(2^{128}\\), while \\(b\\) is a full 256-bit scalar. + /// + /// # Optimization Strategy + /// + /// The function decomposes the 256-bit scalar \\(b\\) as \\(b = b_{lo} + b_{hi} \cdot 2^{128}\\), + /// where both \\(b_{lo}\\) and \\(b_{hi}\\) are 128-bit values. This allows computing: + /// + /// \\[ + /// a_1 A_1 + a_2 A_2 + b_{lo} B + b_{hi} B' + /// \\] + /// + /// where \\(B' = B \cdot 2^{128}\\) is a precomputed constant. Now all four scalars + /// (\\(a_1, a_2, b_{lo}, b_{hi}\\)) are 128 bits, and two of the bases (\\(B\\) and \\(B'\\)) + /// use precomputed tables. + /// + /// # Implementation + /// + /// - For \\(A_1\\) and \\(A_2\\): NAF with window width 5 (8 precomputed points each) + /// - For \\(B\\): NAF with window width 8 when precomputed tables available (64 points), otherwise width 5 + /// - For \\(B'\\): NAF with window width 5 + /// + /// The algorithm shares doublings across all four scalar multiplications, processing + /// only 128 bits instead of 256, providing approximately 2x speedup over the naive approach. + /// + /// This SIMD implementation uses vectorized point operations (AVX2 or AVX512-IFMA) for + /// improved performance over the serial backend. + pub fn mul_128_128_256( + a1: &Scalar, + A1: &EdwardsPoint, + a2: &Scalar, + A2: &EdwardsPoint, + b: &Scalar, + ) -> EdwardsPoint { + // assert that a1 and a2 are less than 2^128 + debug_assert!(a1.as_bytes()[16..32].iter().all(|&b| b == 0)); + debug_assert!(a2.as_bytes()[16..32].iter().all(|&b| b == 0)); + + // Decompose b into b_lo (lower 128 bits) and b_hi (upper 128 bits) + // b = b_lo + b_hi * 2^128 + let b_bytes = b.as_bytes(); + + let mut b_lo_bytes = [0u8; 32]; + let mut b_hi_bytes = [0u8; 32]; + + // Copy lower 16 bytes to b_lo, upper 16 bytes to b_hi + b_lo_bytes[..16].copy_from_slice(&b_bytes[..16]); + b_hi_bytes[..16].copy_from_slice(&b_bytes[16..]); + + let b_lo = Scalar::from_canonical_bytes(b_lo_bytes).unwrap(); + let b_hi = Scalar::from_canonical_bytes(b_hi_bytes).unwrap(); + + // Compute NAF representations (all scalars are now ~128 bits) + let a1_naf = a1.non_adjacent_form(5); + let a2_naf = a2.non_adjacent_form(5); + + #[cfg(feature = "precomputed-tables")] + let b_lo_naf = b_lo.non_adjacent_form(8); + #[cfg(not(feature = "precomputed-tables"))] + let b_lo_naf = b_lo.non_adjacent_form(5); + + let b_hi_naf = b_hi.non_adjacent_form(5); + + // Find starting index - check all NAFs up to bit 127 + // (with potential carry to bit 128 or 129) + let mut i: usize = HEEA_MAX_INDEX; + for j in (0..=HEEA_MAX_INDEX).rev() { + i = j; + if a1_naf[i] != 0 || a2_naf[i] != 0 || b_lo_naf[i] != 0 || b_hi_naf[i] != 0 { + break; + } + } + + // Create lookup tables using SIMD-optimized CachedPoint + let table_A1 = NafLookupTable5::::from(A1); + let table_A2 = NafLookupTable5::::from(A2); + + #[cfg(feature = "precomputed-tables")] + let table_B = &BASEPOINT_ODD_LOOKUP_TABLE; + #[cfg(not(feature = "precomputed-tables"))] + let table_B = &NafLookupTable5::::from(&constants::ED25519_BASEPOINT_POINT); + + // B' = B * 2^128 (precomputed constant point) + // TODO: For optimal performance, this should also use the wider lookup table when precomputed-tables is enabled + let table_B_128 = + &NafLookupTable5::::from(&constants::ED25519_BASEPOINT_128_POINT); + + let mut Q = ExtendedPoint::identity(); + + loop { + Q = Q.double(); + + // Add contributions from a1*A1 + match a1_naf[i].cmp(&0) { + Ordering::Greater => { + Q = &Q + &table_A1.select(a1_naf[i] as usize); + } + Ordering::Less => { + Q = &Q - &table_A1.select(-a1_naf[i] as usize); + } + Ordering::Equal => {} + } + + // Add contributions from a2*A2 + match a2_naf[i].cmp(&0) { + Ordering::Greater => { + Q = &Q + &table_A2.select(a2_naf[i] as usize); + } + Ordering::Less => { + Q = &Q - &table_A2.select(-a2_naf[i] as usize); + } + Ordering::Equal => {} + } + + // Add contributions from b_lo*B + match b_lo_naf[i].cmp(&0) { + Ordering::Greater => { + Q = &Q + &table_B.select(b_lo_naf[i] as usize); + } + Ordering::Less => { + Q = &Q - &table_B.select(-b_lo_naf[i] as usize); + } + Ordering::Equal => {} + } + + // Add contributions from b_hi*B' where B' = B * 2^128 + match b_hi_naf[i].cmp(&0) { + Ordering::Greater => { + Q = &Q + &table_B_128.select(b_hi_naf[i] as usize); + } + Ordering::Less => { + Q = &Q - &table_B_128.select(-b_hi_naf[i] as usize); + } + Ordering::Equal => {} + } + + if i == 0 { + break; + } + i -= 1; + } + + Q.into() + } +} diff --git a/curve25519-dalek/src/edwards.rs b/curve25519-dalek/src/edwards.rs index 44a15ba37..74347976f 100644 --- a/curve25519-dalek/src/edwards.rs +++ b/curve25519-dalek/src/edwards.rs @@ -1072,6 +1072,38 @@ impl EdwardsPoint { ) -> EdwardsPoint { crate::backend::vartime_double_base_mul(a, A, b) } + + /// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. + /// + /// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are less than \\(2^{128}\\). + /// + /// # Example + /// + /// ``` + /// use curve25519_dalek::scalar::Scalar; + /// use curve25519_dalek::constants::ED25519_BASEPOINT_POINT; + /// use curve25519_dalek::edwards::EdwardsPoint; + /// + /// let a1 = Scalar::from(123u64); + /// let a2 = Scalar::from(456u64); + /// let b = Scalar::from(789u64); + /// + /// let A1 = &ED25519_BASEPOINT_POINT * &Scalar::from(2u64); + /// let A2 = &ED25519_BASEPOINT_POINT * &Scalar::from(3u64); + /// + /// // Compute a1*A1 + a2*A2 + b*B efficiently + /// let result = EdwardsPoint::vartime_triple_scalar_mul_basepoint(&a1, &A1, &a2, &A2, &b); + /// ``` + #[allow(non_snake_case)] + pub fn vartime_triple_scalar_mul_basepoint( + a1: &Scalar, + A1: &EdwardsPoint, + a2: &Scalar, + A2: &EdwardsPoint, + b: &Scalar, + ) -> EdwardsPoint { + crate::backend::vartime_triple_base_mul_128_128_256(a1, A1, a2, A2, b) + } } #[cfg(feature = "precomputed-tables")] diff --git a/curve25519-dalek/src/scalar.rs b/curve25519-dalek/src/scalar.rs index 513ad6b76..b67511255 100644 --- a/curve25519-dalek/src/scalar.rs +++ b/curve25519-dalek/src/scalar.rs @@ -149,6 +149,8 @@ use zeroize::Zeroize; use crate::backend; use crate::constants; +pub(crate) const HEEA_MAX_INDEX: usize = 129; + cfg_if! { if #[cfg(curve25519_dalek_backend = "fiat")] { /// An `UnpackedScalar` represents an element of the field GF(l), optimized for speed.