diff --git a/Cargo.lock b/Cargo.lock index 051f403..edf48ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -402,7 +402,7 @@ checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" [[package]] name = "rlnc" -version = "0.8.6" +version = "0.8.7" dependencies = [ "criterion", "rand", diff --git a/Cargo.toml b/Cargo.toml index 181a138..ac5006d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rlnc" -version = "0.8.6" +version = "0.8.7" edition = "2024" resolver = "3" rust-version = "1.89.0" diff --git a/README.md b/README.md index e3addd1..8d3c8de 100644 --- a/README.md +++ b/README.md @@ -157,17 +157,7 @@ For visualizing benchmark results, run following command, which will produce PNG make bench_then_plot # Only runs with `default` features enabled ``` -### On 12th Gen Intel(R) Core(TM) i7-1260P - -Running benchmarks on `Linux 6.14.0-27-generic x86_64`, compiled with `rustc 1.88.0 (6b00bc388 2025-06-23)`. - -Component | Peak Median Throughput (`default` feature) | Peak Median Throughput (`parallel` feature) | Impact of number of pieces on performance --- | --- | --- | --- -Full RLNC Encoder | **30.14 GiB/s** | **23.39 GiB/s** | The number of pieces original data got split into has a **minimal** impact on the encoding speed. -Full RLNC Recoder | **27.26 GiB/s** | **12.63 GiB/s** | Similar to the encoder, the recoder's performance remains largely consistent regardless of how many pieces the original data is split into. -Full RLNC Decoder | **1.59 GiB/s** | **Doesn't yet implement a parallel decoding mode** | As the number of pieces increases, the decoding time increases substantially, leading to a considerable drop in throughput. This indicates that decoding is the most computationally intensive part of the full RLNC scheme, and its performance is inversely proportional to the number of pieces. - -In summary, the full RLNC implementation demonstrates excellent encoding and recoding speeds, consistently achieving GiB/s throughputs with minimal sensitivity to the number of data pieces. The `parallel` feature, leveraging Rust `rayon` data-parallelism framework, also provides good performance for both encoding and recoding. Whether you want to use that feature, completely depends on your usecase. However, decoding remains a much slower operation, with its performance significantly diminishing as the data is split into a greater number of pieces, and currently does **not** implement a parallel decoding algorithm. +More performance benchmark results are presented in the README inside the [./plots](./plots) directory. ## Usage @@ -175,9 +165,9 @@ To use `rlnc` library crate in your Rust project, add it as a dependency in your ```toml [dependencies] -rlnc = "=0.8.6" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. +rlnc = "=0.8.7" # On x86_64 and aarch64 targets, it offers fast encoding, recoding, and decoding using SIMD intrinsics. # or -rlnc = { version = "=0.8.6", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding. +rlnc = { version = "=0.8.7", features = ["parallel"] } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note that this feature doesn't yet parallelize RLNC decoding.
rand = { version = "=0.9.2" } # Required for random number generation ``` diff --git a/plots/README.md b/plots/README.md new file mode 100644 index 0000000..99ca55d --- /dev/null +++ b/plots/README.md @@ -0,0 +1,98 @@ +# Plotted Performance Benchmark Results + +The following plots were generated by running the make recipe below, from the root of the repository, on machines with the specified configurations. + +> [!NOTE] +> These benchmark results don't capture the performance of running the RLNC encoder, recoder, and decoder with the `parallel` feature. + +```bash +make bench_then_plot +``` + +## Performance Characteristics + +Algorithm | Characteristics +--- | --- +Encoding | The number of pieces the original data is split into has a **minimal** impact on encoding speed. +Recoding | Recoding is a wrapper over encoding, with an additional matrix-vector multiplication. As the number of pieces increases, the dimensions of that matrix-vector multiplication grow, raising the computational cost of recoding. +Decoding | As the number of pieces increases, the decoding time increases substantially, leading to a considerable drop in throughput. This indicates that decoding is the most computationally intensive part of the full RLNC scheme, and its performance is inversely proportional to the number of pieces. + +In summary, this RLNC implementation demonstrates excellent encoding and recoding speeds, consistently achieving GiB/s throughputs with relatively minimal sensitivity to the number of data pieces. The `parallel` feature, leveraging Rust's `rayon` data-parallelism framework, also provides good performance for both encoding and recoding. Whether you want to use that feature depends entirely on your use case. However, decoding remains a much slower operation, with its performance significantly diminishing as the data is split into a greater number of pieces, and a parallel decoding algorithm is currently **not** implemented.
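To make the decoding-cost claim above concrete, here is a minimal, illustrative sketch of Gaussian elimination over GF(2^8), the kind of work an RLNC decoder performs. It is *not* the `rlnc` crate's decoder: the helpers `gf256_mul`/`gf256_inv`, the AES-style reduction polynomial `0x11b`, and the row layout are assumptions made purely for illustration. With `k` pieces, each of the `k` pivots sweeps every other row across the full augmented width, so the byte-level work grows roughly as `k^2 * (k + piece_len)`, consistent with the throughput drop observed as the piece count grows.

```rust
// Illustrative cost sketch only -- NOT the rlnc crate's decoder.
// Assumes a full-rank matrix with nonzero pivots (a real decoder must
// swap rows when a pivot is zero). Each row is laid out as
// [coding vector (k bytes) || coded piece (piece_len bytes)].

/// GF(2^8) multiply (Russian-peasant style), reduced modulo
/// x^8 + x^4 + x^3 + x + 1 (0x11b). The polynomial rlnc actually uses
/// is an assumption here.
fn gf256_mul(mut a: u8, mut b: u8) -> u8 {
    let mut acc = 0u8;
    while b != 0 {
        if b & 1 != 0 {
            acc ^= a;
        }
        let carry = a & 0x80;
        a <<= 1;
        if carry != 0 {
            a ^= 0x1b;
        }
        b >>= 1;
    }
    acc
}

/// Multiplicative inverse via a^254, i.e. Fermat's little theorem in
/// the 255-element multiplicative group of GF(2^8).
fn gf256_inv(a: u8) -> u8 {
    let (mut result, mut base, mut exp) = (1u8, a, 254u8);
    while exp != 0 {
        if exp & 1 != 0 {
            result = gf256_mul(result, base);
        }
        base = gf256_mul(base, base);
        exp >>= 1;
    }
    result
}

/// Reduce k augmented rows to identity form. Every pivot touches all
/// other rows across the whole row width, so the total work is about
/// k * k * (k + piece_len) GF(2^8) operations.
fn gaussian_eliminate(rows: &mut [Vec<u8>]) {
    let k = rows.len();
    let width = rows.first().map_or(0, |r| r.len());
    for pivot in 0..k {
        // Normalize the pivot row so the pivot element becomes 1.
        let inv = gf256_inv(rows[pivot][pivot]);
        for byte in rows[pivot].iter_mut() {
            *byte = gf256_mul(*byte, inv);
        }
        // Eliminate this column from every other row; this k * width
        // inner sweep per pivot is what makes decoding scale with k^2.
        for row in 0..k {
            if row == pivot || rows[row][pivot] == 0 {
                continue;
            }
            let factor = rows[row][pivot];
            for col in 0..width {
                let scaled = gf256_mul(rows[pivot][col], factor);
                rows[row][col] ^= scaled; // addition in GF(2^8) is XOR
            }
        }
    }
}
```

Encoding, by contrast, costs one `mul_vec_by_scalar_then_add_into`-style pass per source piece, i.e. work proportional to the total data size and essentially independent of `k`, which matches the comparatively flat encoder curves in the plots below.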
+ +## On 12th Gen Intel(R) Core(TM) i7-1260P + +Running Linux kernel + +```bash +$ uname -srm +Linux 6.17.0-5-generic x86_64 +``` + +with Rust compiler + +```bash +$ rustc --version +rustc 1.90.0 (1159e78c4 2025-09-14) +``` + +and the following CPU feature flags + +```bash +$ lscpu | awk -F': *' '/Flags/{gsub(" ", ", ", $2);print $2}' +fpu, vme, de, pse, tsc, msr, pae, mce, cx8, apic, sep, mtrr, pge, mca, cmov, pat, pse36, clflush, dts, acpi, mmx, fxsr, sse, sse2, ss, ht, tm, pbe, syscall, nx, pdpe1gb, rdtscp, lm, constant_tsc, art, arch_perfmon, pebs, bts, rep_good, nopl, xtopology, nonstop_tsc, cpuid, aperfmperf, tsc_known_freq, pni, pclmulqdq, dtes64, monitor, ds_cpl, vmx, smx, est, tm2, ssse3, sdbg, fma, cx16, xtpr, pdcm, pcid, sse4_1, sse4_2, x2apic, movbe, popcnt, tsc_deadline_timer, aes, xsave, avx, f16c, rdrand, lahf_lm, abm, 3dnowprefetch, cpuid_fault, epb, ssbd, ibrs, ibpb, stibp, ibrs_enhanced, tpr_shadow, flexpriority, ept, vpid, ept_ad, fsgsbase, tsc_adjust, bmi1, avx2, smep, bmi2, erms, invpcid, rdseed, adx, smap, clflushopt, clwb, intel_pt, sha_ni, xsaveopt, xsavec, xgetbv1, xsaves, split_lock_detect, user_shstk, avx_vnni, dtherm, ida, arat, pln, pts, hwp, hwp_notify, hwp_act_window, hwp_epp, hwp_pkg_req, hfi, vnmi, umip, pku, ospke, waitpkg, gfni, vaes, vpclmulqdq, rdpid, movdiri, movdir64b, fsrm, md_clear, serialize, arch_lbr, ibt, flush_l1d, arch_capabilities +``` + +### Encoder + +![benchmark_encode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_encode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +![benchmark_encode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_encode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +### Recoder + +![benchmark_recode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_recode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +![benchmark_recode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_recode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +### Decoder + +![benchmark_decode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_decode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +## On AMD EPYC 9R14 (AWS EC2 `m7a.large`) + +Running Linux kernel + +```bash +$ uname -srm +Linux 6.14.0-1011-aws x86_64 +``` + +with Rust compiler + +```bash +$ rustc --version +rustc 1.90.0 (1159e78c4 2025-09-14) +``` + +and the following CPU feature flags + +```bash +$ lscpu | awk -F': *' '/Flags/{gsub(" ", ", ", $2);print $2}' +fpu, vme, de, pse, tsc, msr, pae, mce, cx8, apic, sep, mtrr, pge, mca, cmov, pat, pse36, clflush, dts, acpi, mmx, fxsr, sse, sse2, ss, ht, tm, pbe, syscall, nx, pdpe1gb, rdtscp, lm, constant_tsc, art, arch_perfmon, pebs, bts, rep_good, nopl, xtopology, nonstop_tsc, cpuid, aperfmperf, tsc_known_freq, pni, pclmulqdq, dtes64, monitor, ds_cpl, vmx, smx, est, tm2, ssse3, sdbg, fma, cx16, xtpr, pdcm, pcid, sse4_1, sse4_2, x2apic, movbe, popcnt, tsc_deadline_timer, aes, xsave, avx, f16c, rdrand, lahf_lm, abm, 3dnowprefetch, cpuid_fault, epb, ssbd, ibrs, ibpb, stibp, ibrs_enhanced, tpr_shadow, flexpriority, ept, vpid, ept_ad, fsgsbase, tsc_adjust, bmi1, avx2, smep, bmi2, erms, invpcid, rdseed, adx, smap, clflushopt, clwb, intel_pt, sha_ni, xsaveopt,
xsavec, xgetbv1, xsaves, split_lock_detect, user_shstk, avx_vnni, dtherm, ida, arat, pln, pts, hwp, hwp_notify, hwp_act_window, hwp_epp, hwp_pkg_req, hfi, vnmi, umip, pku, ospke, waitpkg, gfni, vaes, vpclmulqdq, rdpid, movdiri, movdir64b, fsrm, md_clear, serialize, arch_lbr, ibt, flush_l1d, arch_capabilities +``` + +### Encoder + +![benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +![benchmark_encode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_encode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +### Recoder + +![benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +![benchmark_recode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_recode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png) + +### Decoder + +![benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14](./benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png) diff --git a/plots/benchmark_decode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_decode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..bba18f8 Binary files /dev/null and b/plots/benchmark_decode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png index 4605cbd..200a2d3 100644 Binary files a/plots/benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png and b/plots/benchmark_decode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_encode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_encode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..3b5a008 Binary files /dev/null and b/plots/benchmark_encode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png index 3be89d8..4c5f5a1 100644 Binary files a/plots/benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png and b/plots/benchmark_encode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_encode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_encode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..f5fc7c2 Binary files /dev/null and b/plots/benchmark_encode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_encode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_encode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..ecdf9fe Binary files /dev/null and 
b/plots/benchmark_encode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_recode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_recode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..fbb5e7a Binary files /dev/null and b/plots/benchmark_recode_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png index 69392b0..6d2620a 100644 Binary files a/plots/benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png and b/plots/benchmark_recode_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_recode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_recode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..6f72427 Binary files /dev/null and b/plots/benchmark_recode_zero_alloc_on_12th_Gen_IntelR_CoreTM_i7-1260P_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/plots/benchmark_recode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png b/plots/benchmark_recode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png new file mode 100644 index 0000000..3ed37a3 Binary files /dev/null and b/plots/benchmark_recode_zero_alloc_on_AMD_EPYC_9R14_with_rustc_1.90.0_1159e78c4_2025-09-14.png differ diff --git a/src/common/simd/x86/avx2.rs b/src/common/simd/x86/avx2.rs index 2212c5b..e963e01 100644 --- a/src/common/simd/x86/avx2.rs +++ b/src/common/simd/x86/avx2.rs @@ -11,7 +11,7 @@ use std::arch::x86_64::*; #[target_feature(enable = "avx2")] pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { - let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + let mut iter = vec.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm256_broadcastsi128_si256(_mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast())); @@ -19,17 +19,48 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { let l_mask = _mm256_set1_epi8(0x0f); for chunk in iter.by_ref() { - let chunk_simd = _mm256_lddqu_si256(chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm256_and_si256(chunk_simd, l_mask); - let chunk_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm256_srli_epi64(chunk_simd, 4); - let chunk_simd_hi = _mm256_and_si256(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi); - - let res = _mm256_xor_si256(chunk_simd_lo, chunk_simd_hi); - _mm256_storeu_si256(chunk.as_mut_ptr().cast(), res); + let (chunk0, chunk1, chunk2, chunk3) = { + let (chunk0, rest) = chunk.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_simd = _mm256_lddqu_si256(chunk0.as_ptr().cast()); + let chunk1_simd = _mm256_lddqu_si256(chunk1.as_ptr().cast()); + let chunk2_simd = _mm256_lddqu_si256(chunk2.as_ptr().cast()); + let chunk3_simd = _mm256_lddqu_si256(chunk3.as_ptr().cast()); + + let chunk0_simd_lo = _mm256_and_si256(chunk0_simd, l_mask); + let 
chunk1_simd_lo = _mm256_and_si256(chunk1_simd, l_mask); + let chunk2_simd_lo = _mm256_and_si256(chunk2_simd, l_mask); + let chunk3_simd_lo = _mm256_and_si256(chunk3_simd, l_mask); + + let chunk0_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk0_simd_lo); + let chunk1_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk1_simd_lo); + let chunk2_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk2_simd_lo); + let chunk3_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk3_simd_lo); + + let chunk0_simd_hi = _mm256_and_si256(_mm256_srli_epi64(chunk0_simd, 4), l_mask); + let chunk1_simd_hi = _mm256_and_si256(_mm256_srli_epi64(chunk1_simd, 4), l_mask); + let chunk2_simd_hi = _mm256_and_si256(_mm256_srli_epi64(chunk2_simd, 4), l_mask); + let chunk3_simd_hi = _mm256_and_si256(_mm256_srli_epi64(chunk3_simd, 4), l_mask); + + let chunk0_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk0_simd_hi); + let chunk1_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk1_simd_hi); + let chunk2_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk2_simd_hi); + let chunk3_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk3_simd_hi); + + let res0 = _mm256_xor_si256(chunk0_simd_lo, chunk0_simd_hi); + let res1 = _mm256_xor_si256(chunk1_simd_lo, chunk1_simd_hi); + let res2 = _mm256_xor_si256(chunk2_simd_lo, chunk2_simd_hi); + let res3 = _mm256_xor_si256(chunk3_simd_lo, chunk3_simd_hi); + + _mm256_storeu_si256(chunk0.as_mut_ptr().cast(), res0); + _mm256_storeu_si256(chunk1.as_mut_ptr().cast(), res1); + _mm256_storeu_si256(chunk2.as_mut_ptr().cast(), res2); + _mm256_storeu_si256(chunk3.as_mut_ptr().cast(), res3); } } @@ -40,31 +71,58 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "avx2")] pub unsafe fn add_vec_into(vec_dst: &mut [u8], vec_src: &[u8]) { - let mut iter_dst = vec_dst.chunks_exact_mut(2 * GF256_HALF_ORDER); - let mut iter_src = vec_src.chunks_exact(2 * GF256_HALF_ORDER); + let mut iter_dst = vec_dst.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER); + let mut iter_src = vec_src.chunks_exact(4 * 2 * GF256_HALF_ORDER); unsafe { for (chunk_dst, chunk_src) in iter_dst.by_ref().zip(iter_src.by_ref()) { - let chunk_dst_simd = _mm256_lddqu_si256(chunk_dst.as_ptr().cast()); - let chunk_src_simd = _mm256_lddqu_si256(chunk_src.as_ptr().cast()); - let chunk_result = _mm256_xor_si256(chunk_dst_simd, chunk_src_simd); - - _mm256_storeu_si256(chunk_dst.as_mut_ptr().cast(), chunk_result); + let (chunk0_dst, chunk1_dst, chunk2_dst, chunk3_dst) = { + let (chunk0, rest) = chunk_dst.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let (chunk0_src, chunk1_src, chunk2_src, chunk3_src) = { + let (chunk0, rest) = chunk_src.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_dst_simd = _mm256_lddqu_si256(chunk0_dst.as_ptr().cast()); + let chunk0_src_simd = _mm256_lddqu_si256(chunk0_src.as_ptr().cast()); + let chunk0_result = _mm256_xor_si256(chunk0_dst_simd, chunk0_src_simd); + _mm256_storeu_si256(chunk0_dst.as_mut_ptr().cast(), chunk0_result); + + let chunk1_dst_simd = _mm256_lddqu_si256(chunk1_dst.as_ptr().cast()); + let chunk1_src_simd = _mm256_lddqu_si256(chunk1_src.as_ptr().cast()); + let chunk1_result = _mm256_xor_si256(chunk1_dst_simd, chunk1_src_simd); + 
_mm256_storeu_si256(chunk1_dst.as_mut_ptr().cast(), chunk1_result); + + let chunk2_dst_simd = _mm256_lddqu_si256(chunk2_dst.as_ptr().cast()); + let chunk2_src_simd = _mm256_lddqu_si256(chunk2_src.as_ptr().cast()); + let chunk2_result = _mm256_xor_si256(chunk2_dst_simd, chunk2_src_simd); + _mm256_storeu_si256(chunk2_dst.as_mut_ptr().cast(), chunk2_result); + + let chunk3_dst_simd = _mm256_lddqu_si256(chunk3_dst.as_ptr().cast()); + let chunk3_src_simd = _mm256_lddqu_si256(chunk3_src.as_ptr().cast()); + let chunk3_result = _mm256_xor_si256(chunk3_dst_simd, chunk3_src_simd); + _mm256_storeu_si256(chunk3_dst.as_mut_ptr().cast(), chunk3_result); } } - let remainder_dst = iter_dst.into_remainder(); - let remainder_src = iter_src.remainder(); - - remainder_dst.iter_mut().zip(remainder_src).for_each(|(a, b)| { + iter_dst.into_remainder().iter_mut().zip(iter_src.remainder()).for_each(|(a, b)| { *a ^= b; }); } #[target_feature(enable = "avx2")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * 2 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm256_broadcastsi128_si256(_mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast())); @@ -72,21 +130,66 @@ pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: let l_mask = _mm256_set1_epi8(0x0f); for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { - let mul_vec_chunk_simd = _mm256_lddqu_si256(mul_vec_chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm256_and_si256(mul_vec_chunk_simd, l_mask); - let chunk_simd_lo = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm256_srli_epi64(mul_vec_chunk_simd, 4); - let chunk_simd_hi = _mm256_and_si256(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi); - - let scaled_res = _mm256_xor_si256(chunk_simd_lo, chunk_simd_hi); - - let add_vec_chunk_simd = _mm256_lddqu_si256(add_vec_chunk.as_ptr().cast()); - let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res); - - _mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res); + let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = { + let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let mul_vec_chunk0_simd = _mm256_lddqu_si256(mul_vec_chunk0.as_ptr().cast()); + let mul_vec_chunk1_simd = _mm256_lddqu_si256(mul_vec_chunk1.as_ptr().cast()); + let mul_vec_chunk2_simd = _mm256_lddqu_si256(mul_vec_chunk2.as_ptr().cast()); + let mul_vec_chunk3_simd = _mm256_lddqu_si256(mul_vec_chunk3.as_ptr().cast()); + + let chunk_simd_lo0 = _mm256_and_si256(mul_vec_chunk0_simd, l_mask); + let chunk_simd_lo1 = _mm256_and_si256(mul_vec_chunk1_simd, l_mask); + let chunk_simd_lo2 = _mm256_and_si256(mul_vec_chunk2_simd, l_mask); + let chunk_simd_lo3 = _mm256_and_si256(mul_vec_chunk3_simd, l_mask); + + let chunk_simd_hi0 = _mm256_and_si256(_mm256_srli_epi64(mul_vec_chunk0_simd, 4), l_mask); + let chunk_simd_hi1 = _mm256_and_si256(_mm256_srli_epi64(mul_vec_chunk1_simd, 4), l_mask); + let chunk_simd_hi2 = 
_mm256_and_si256(_mm256_srli_epi64(mul_vec_chunk2_simd, 4), l_mask); + let chunk_simd_hi3 = _mm256_and_si256(_mm256_srli_epi64(mul_vec_chunk3_simd, 4), l_mask); + + let chunk_simd_lo0 = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo0); + let chunk_simd_lo1 = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo1); + let chunk_simd_lo2 = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo2); + let chunk_simd_lo3 = _mm256_shuffle_epi8(l_tbl, chunk_simd_lo3); + + let chunk_simd_hi0 = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi0); + let chunk_simd_hi1 = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi1); + let chunk_simd_hi2 = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi2); + let chunk_simd_hi3 = _mm256_shuffle_epi8(h_tbl, chunk_simd_hi3); + + let scaled_res0 = _mm256_xor_si256(chunk_simd_lo0, chunk_simd_hi0); + let scaled_res1 = _mm256_xor_si256(chunk_simd_lo1, chunk_simd_hi1); + let scaled_res2 = _mm256_xor_si256(chunk_simd_lo2, chunk_simd_hi2); + let scaled_res3 = _mm256_xor_si256(chunk_simd_lo3, chunk_simd_hi3); + + let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = { + let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let add_vec_chunk0_simd = _mm256_lddqu_si256(add_vec_chunk0.as_ptr().cast()); + let add_vec_chunk1_simd = _mm256_lddqu_si256(add_vec_chunk1.as_ptr().cast()); + let add_vec_chunk2_simd = _mm256_lddqu_si256(add_vec_chunk2.as_ptr().cast()); + let add_vec_chunk3_simd = _mm256_lddqu_si256(add_vec_chunk3.as_ptr().cast()); + + let accum_res0 = _mm256_xor_si256(add_vec_chunk0_simd, scaled_res0); + let accum_res1 = _mm256_xor_si256(add_vec_chunk1_simd, scaled_res1); + let accum_res2 = _mm256_xor_si256(add_vec_chunk2_simd, scaled_res2); + let accum_res3 = _mm256_xor_si256(add_vec_chunk3_simd, scaled_res3); + + _mm256_storeu_si256(add_vec_chunk0.as_mut_ptr().cast(), accum_res0); + _mm256_storeu_si256(add_vec_chunk1.as_mut_ptr().cast(), accum_res1); + _mm256_storeu_si256(add_vec_chunk2.as_mut_ptr().cast(), accum_res2); + _mm256_storeu_si256(add_vec_chunk3.as_mut_ptr().cast(), accum_res3); } } diff --git a/src/common/simd/x86/avx512.rs b/src/common/simd/x86/avx512.rs index b544ced..38b26ed 100644 --- a/src/common/simd/x86/avx512.rs +++ b/src/common/simd/x86/avx512.rs @@ -11,7 +11,7 @@ use std::arch::x86_64::*; #[target_feature(enable = "avx512bw")] pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { - let mut iter = vec.chunks_exact_mut(4 * GF256_HALF_ORDER); + let mut iter = vec.chunks_exact_mut(4 * 4 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm512_broadcast_i32x4(_mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast())); @@ -19,17 +19,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { let l_mask = _mm512_set1_epi8(0x0f); for chunk in iter.by_ref() { - let chunk_simd = _mm512_loadu_si512(chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm512_and_si512(chunk_simd, l_mask); - let chunk_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm512_srli_epi64(chunk_simd, 4); - let chunk_simd_hi = _mm512_and_si512(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk_simd_hi); - - let res = _mm512_xor_si512(chunk_simd_lo, chunk_simd_hi); - _mm512_storeu_si512(chunk.as_mut_ptr().cast(), res); + let (chunk0, chunk1, chunk2, chunk3) = { + let (chunk0, rest) = chunk.split_at_mut_unchecked(4 * 
GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_simd = _mm512_loadu_si512(chunk0.as_ptr().cast()); + let chunk1_simd = _mm512_loadu_si512(chunk1.as_ptr().cast()); + let chunk2_simd = _mm512_loadu_si512(chunk2.as_ptr().cast()); + let chunk3_simd = _mm512_loadu_si512(chunk3.as_ptr().cast()); + + let chunk0_simd_lo = _mm512_and_si512(chunk0_simd, l_mask); + let chunk1_simd_lo = _mm512_and_si512(chunk1_simd, l_mask); + let chunk2_simd_lo = _mm512_and_si512(chunk2_simd, l_mask); + let chunk3_simd_lo = _mm512_and_si512(chunk3_simd, l_mask); + + let chunk0_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk0_simd_lo); + let chunk1_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk1_simd_lo); + let chunk2_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk2_simd_lo); + let chunk3_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk3_simd_lo); + + let chunk0_simd_hi = _mm512_srli_epi64(chunk0_simd, 4); + let chunk1_simd_hi = _mm512_srli_epi64(chunk1_simd, 4); + let chunk2_simd_hi = _mm512_srli_epi64(chunk2_simd, 4); + let chunk3_simd_hi = _mm512_srli_epi64(chunk3_simd, 4); + + let chunk0_simd_hi = _mm512_and_si512(chunk0_simd_hi, l_mask); + let chunk1_simd_hi = _mm512_and_si512(chunk1_simd_hi, l_mask); + let chunk2_simd_hi = _mm512_and_si512(chunk2_simd_hi, l_mask); + let chunk3_simd_hi = _mm512_and_si512(chunk3_simd_hi, l_mask); + + let chunk0_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk0_simd_hi); + let chunk1_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk1_simd_hi); + let chunk2_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk2_simd_hi); + let chunk3_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk3_simd_hi); + + let res0 = _mm512_xor_si512(chunk0_simd_lo, chunk0_simd_hi); + let res1 = _mm512_xor_si512(chunk1_simd_lo, chunk1_simd_hi); + let res2 = _mm512_xor_si512(chunk2_simd_lo, chunk2_simd_hi); + let res3 = _mm512_xor_si512(chunk3_simd_lo, chunk3_simd_hi); + + _mm512_storeu_si512(chunk0.as_mut_ptr().cast(), res0); + _mm512_storeu_si512(chunk1.as_mut_ptr().cast(), res1); + _mm512_storeu_si512(chunk2.as_mut_ptr().cast(), res2); + _mm512_storeu_si512(chunk3.as_mut_ptr().cast(), res3); } } @@ -40,31 +76,58 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "avx512f")] pub unsafe fn add_vec_into(vec_dst: &mut [u8], vec_src: &[u8]) { - let mut iter_dst = vec_dst.chunks_exact_mut(4 * GF256_HALF_ORDER); - let mut iter_src = vec_src.chunks_exact(4 * GF256_HALF_ORDER); + let mut iter_dst = vec_dst.chunks_exact_mut(4 * 4 * GF256_HALF_ORDER); + let mut iter_src = vec_src.chunks_exact(4 * 4 * GF256_HALF_ORDER); unsafe { for (chunk_dst, chunk_src) in iter_dst.by_ref().zip(iter_src.by_ref()) { - let chunk_dst_simd = _mm512_loadu_si512(chunk_dst.as_ptr().cast()); - let chunk_src_simd = _mm512_loadu_si512(chunk_src.as_ptr().cast()); - let chunk_result = _mm512_xor_si512(chunk_dst_simd, chunk_src_simd); - - _mm512_storeu_si512(chunk_dst.as_mut_ptr().cast(), chunk_result); + let (chunk0_dst, chunk1_dst, chunk2_dst, chunk3_dst) = { + let (chunk0, rest) = chunk_dst.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let (chunk0_src, chunk1_src, chunk2_src, chunk3_src) = { + let (chunk0, rest) = chunk_src.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) 
= rest.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_dst_simd = _mm512_loadu_si512(chunk0_dst.as_ptr().cast()); + let chunk0_src_simd = _mm512_loadu_si512(chunk0_src.as_ptr().cast()); + let chunk0_result = _mm512_xor_si512(chunk0_dst_simd, chunk0_src_simd); + _mm512_storeu_si512(chunk0_dst.as_mut_ptr().cast(), chunk0_result); + + let chunk1_dst_simd = _mm512_loadu_si512(chunk1_dst.as_ptr().cast()); + let chunk1_src_simd = _mm512_loadu_si512(chunk1_src.as_ptr().cast()); + let chunk1_result = _mm512_xor_si512(chunk1_dst_simd, chunk1_src_simd); + _mm512_storeu_si512(chunk1_dst.as_mut_ptr().cast(), chunk1_result); + + let chunk2_dst_simd = _mm512_loadu_si512(chunk2_dst.as_ptr().cast()); + let chunk2_src_simd = _mm512_loadu_si512(chunk2_src.as_ptr().cast()); + let chunk2_result = _mm512_xor_si512(chunk2_dst_simd, chunk2_src_simd); + _mm512_storeu_si512(chunk2_dst.as_mut_ptr().cast(), chunk2_result); + + let chunk3_dst_simd = _mm512_loadu_si512(chunk3_dst.as_ptr().cast()); + let chunk3_src_simd = _mm512_loadu_si512(chunk3_src.as_ptr().cast()); + let chunk3_result = _mm512_xor_si512(chunk3_dst_simd, chunk3_src_simd); + _mm512_storeu_si512(chunk3_dst.as_mut_ptr().cast(), chunk3_result); } } - let remainder_dst = iter_dst.into_remainder(); - let remainder_src = iter_src.remainder(); - - remainder_dst.iter_mut().zip(remainder_src).for_each(|(a, b)| { + iter_dst.into_remainder().iter_mut().zip(iter_src.remainder()).for_each(|(a, b)| { *a ^= b; }); } #[target_feature(enable = "avx512bw")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(4 * GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 4 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * 4 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm512_broadcast_i32x4(_mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast())); @@ -72,21 +135,71 @@ pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: let l_mask = _mm512_set1_epi8(0x0f); for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { - let mul_vec_chunk_simd = _mm512_loadu_si512(mul_vec_chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm512_and_si512(mul_vec_chunk_simd, l_mask); - let chunk_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm512_srli_epi64(mul_vec_chunk_simd, 4); - let chunk_simd_hi = _mm512_and_si512(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk_simd_hi); - - let scaled_res = _mm512_xor_si512(chunk_simd_lo, chunk_simd_hi); - - let add_vec_chunk_simd = _mm512_loadu_si512(add_vec_chunk.as_ptr().cast()); - let accum_res = _mm512_xor_si512(add_vec_chunk_simd, scaled_res); - - _mm512_storeu_si512(add_vec_chunk.as_mut_ptr().cast(), accum_res); + let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = { + let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let mul_vec_chunk0_simd = _mm512_loadu_si512(mul_vec_chunk0.as_ptr().cast()); + let mul_vec_chunk1_simd = 
_mm512_loadu_si512(mul_vec_chunk1.as_ptr().cast()); + let mul_vec_chunk2_simd = _mm512_loadu_si512(mul_vec_chunk2.as_ptr().cast()); + let mul_vec_chunk3_simd = _mm512_loadu_si512(mul_vec_chunk3.as_ptr().cast()); + + let chunk0_simd_lo = _mm512_and_si512(mul_vec_chunk0_simd, l_mask); + let chunk1_simd_lo = _mm512_and_si512(mul_vec_chunk1_simd, l_mask); + let chunk2_simd_lo = _mm512_and_si512(mul_vec_chunk2_simd, l_mask); + let chunk3_simd_lo = _mm512_and_si512(mul_vec_chunk3_simd, l_mask); + + let chunk0_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk0_simd_lo); + let chunk1_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk1_simd_lo); + let chunk2_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk2_simd_lo); + let chunk3_simd_lo = _mm512_shuffle_epi8(l_tbl, chunk3_simd_lo); + + let chunk0_simd_hi = _mm512_srli_epi64(mul_vec_chunk0_simd, 4); + let chunk1_simd_hi = _mm512_srli_epi64(mul_vec_chunk1_simd, 4); + let chunk2_simd_hi = _mm512_srli_epi64(mul_vec_chunk2_simd, 4); + let chunk3_simd_hi = _mm512_srli_epi64(mul_vec_chunk3_simd, 4); + + let chunk0_simd_hi = _mm512_and_si512(chunk0_simd_hi, l_mask); + let chunk1_simd_hi = _mm512_and_si512(chunk1_simd_hi, l_mask); + let chunk2_simd_hi = _mm512_and_si512(chunk2_simd_hi, l_mask); + let chunk3_simd_hi = _mm512_and_si512(chunk3_simd_hi, l_mask); + + let chunk0_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk0_simd_hi); + let chunk1_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk1_simd_hi); + let chunk2_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk2_simd_hi); + let chunk3_simd_hi = _mm512_shuffle_epi8(h_tbl, chunk3_simd_hi); + + let scaled_res0 = _mm512_xor_si512(chunk0_simd_lo, chunk0_simd_hi); + let scaled_res1 = _mm512_xor_si512(chunk1_simd_lo, chunk1_simd_hi); + let scaled_res2 = _mm512_xor_si512(chunk2_simd_lo, chunk2_simd_hi); + let scaled_res3 = _mm512_xor_si512(chunk3_simd_lo, chunk3_simd_hi); + + let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = { + let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let add_vec_chunk0_simd = _mm512_loadu_si512(add_vec_chunk0.as_ptr().cast()); + let add_vec_chunk1_simd = _mm512_loadu_si512(add_vec_chunk1.as_ptr().cast()); + let add_vec_chunk2_simd = _mm512_loadu_si512(add_vec_chunk2.as_ptr().cast()); + let add_vec_chunk3_simd = _mm512_loadu_si512(add_vec_chunk3.as_ptr().cast()); + + let accum_res0 = _mm512_xor_si512(add_vec_chunk0_simd, scaled_res0); + let accum_res1 = _mm512_xor_si512(add_vec_chunk1_simd, scaled_res1); + let accum_res2 = _mm512_xor_si512(add_vec_chunk2_simd, scaled_res2); + let accum_res3 = _mm512_xor_si512(add_vec_chunk3_simd, scaled_res3); + + _mm512_storeu_si512(add_vec_chunk0.as_mut_ptr().cast(), accum_res0); + _mm512_storeu_si512(add_vec_chunk1.as_mut_ptr().cast(), accum_res1); + _mm512_storeu_si512(add_vec_chunk2.as_mut_ptr().cast(), accum_res2); + _mm512_storeu_si512(add_vec_chunk3.as_mut_ptr().cast(), accum_res3); } } diff --git a/src/common/simd/x86/gfni/m128i.rs b/src/common/simd/x86/gfni/m128i.rs index 62283c3..a3a7c8c 100644 --- a/src/common/simd/x86/gfni/m128i.rs +++ b/src/common/simd/x86/gfni/m128i.rs @@ -8,7 +8,7 @@ use std::arch::x86_64::*; #[target_feature(enable = "gfni", enable = "avx512vl")] pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { - let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + let mut iter = 
vec.chunks_exact_mut(GF256_HALF_ORDER); unsafe { let scalar_simd = _mm_set1_epi8(scalar as i8); @@ -27,8 +27,8 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "gfni", enable = "avx512vl")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(GF256_HALF_ORDER); unsafe { let scalar_simd = _mm_set1_epi8(scalar as i8); diff --git a/src/common/simd/x86/gfni/m256i.rs b/src/common/simd/x86/gfni/m256i.rs index f101bcd..631b186 100644 --- a/src/common/simd/x86/gfni/m256i.rs +++ b/src/common/simd/x86/gfni/m256i.rs @@ -27,20 +27,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "gfni", enable = "avx512vl")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * 2 * GF256_HALF_ORDER); unsafe { let scalar_simd = _mm256_set1_epi8(scalar as i8); for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { - let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast()); - let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd); + let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = { + let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(2 * GF256_HALF_ORDER); - let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast()); - let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res); + (chunk0, chunk1, chunk2, chunk3) + }; - _mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res); + let mul_vec_chunk0_simd = _mm256_loadu_si256(mul_vec_chunk0.as_ptr().cast()); + let mul_vec_chunk1_simd = _mm256_loadu_si256(mul_vec_chunk1.as_ptr().cast()); + let mul_vec_chunk2_simd = _mm256_loadu_si256(mul_vec_chunk2.as_ptr().cast()); + let mul_vec_chunk3_simd = _mm256_loadu_si256(mul_vec_chunk3.as_ptr().cast()); + + let scaled_res0 = _mm256_gf2p8mul_epi8(mul_vec_chunk0_simd, scalar_simd); + let scaled_res1 = _mm256_gf2p8mul_epi8(mul_vec_chunk1_simd, scalar_simd); + let scaled_res2 = _mm256_gf2p8mul_epi8(mul_vec_chunk2_simd, scalar_simd); + let scaled_res3 = _mm256_gf2p8mul_epi8(mul_vec_chunk3_simd, scalar_simd); + + let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = { + let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let add_vec_chunk0_simd = _mm256_loadu_si256(add_vec_chunk0.as_ptr().cast()); + let add_vec_chunk1_simd = _mm256_loadu_si256(add_vec_chunk1.as_ptr().cast()); + let add_vec_chunk2_simd = _mm256_loadu_si256(add_vec_chunk2.as_ptr().cast()); + let add_vec_chunk3_simd = 
_mm256_loadu_si256(add_vec_chunk3.as_ptr().cast()); + + let accum_res0 = _mm256_xor_si256(add_vec_chunk0_simd, scaled_res0); + let accum_res1 = _mm256_xor_si256(add_vec_chunk1_simd, scaled_res1); + let accum_res2 = _mm256_xor_si256(add_vec_chunk2_simd, scaled_res2); + let accum_res3 = _mm256_xor_si256(add_vec_chunk3_simd, scaled_res3); + + _mm256_storeu_si256(add_vec_chunk0.as_mut_ptr().cast(), accum_res0); + _mm256_storeu_si256(add_vec_chunk1.as_mut_ptr().cast(), accum_res1); + _mm256_storeu_si256(add_vec_chunk2.as_mut_ptr().cast(), accum_res2); + _mm256_storeu_si256(add_vec_chunk3.as_mut_ptr().cast(), accum_res3); } } diff --git a/src/common/simd/x86/gfni/m512i.rs b/src/common/simd/x86/gfni/m512i.rs index 6b9049f..928b067 100644 --- a/src/common/simd/x86/gfni/m512i.rs +++ b/src/common/simd/x86/gfni/m512i.rs @@ -8,15 +8,33 @@ use std::arch::x86_64::*; #[target_feature(enable = "gfni", enable = "avx512f")] pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { - let mut iter = vec.chunks_exact_mut(4 * GF256_HALF_ORDER); + let mut iter = vec.chunks_exact_mut(4 * 4 * GF256_HALF_ORDER); unsafe { let scalar_simd = _mm512_set1_epi8(scalar as i8); for chunk in iter.by_ref() { - let chunk_simd = _mm512_loadu_si512(chunk.as_ptr().cast()); - let res = _mm512_gf2p8mul_epi8(chunk_simd, scalar_simd); + let (chunk0, chunk1, chunk2, chunk3) = { + let (chunk0, rest) = chunk.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); - _mm512_storeu_si512(chunk.as_mut_ptr().cast(), res); + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_simd = _mm512_loadu_si512(chunk0.as_ptr().cast()); + let chunk1_simd = _mm512_loadu_si512(chunk1.as_ptr().cast()); + let chunk2_simd = _mm512_loadu_si512(chunk2.as_ptr().cast()); + let chunk3_simd = _mm512_loadu_si512(chunk3.as_ptr().cast()); + + let res0 = _mm512_gf2p8mul_epi8(chunk0_simd, scalar_simd); + let res1 = _mm512_gf2p8mul_epi8(chunk1_simd, scalar_simd); + let res2 = _mm512_gf2p8mul_epi8(chunk2_simd, scalar_simd); + let res3 = _mm512_gf2p8mul_epi8(chunk3_simd, scalar_simd); + + _mm512_storeu_si512(chunk0.as_mut_ptr().cast(), res0); + _mm512_storeu_si512(chunk1.as_mut_ptr().cast(), res1); + _mm512_storeu_si512(chunk2.as_mut_ptr().cast(), res2); + _mm512_storeu_si512(chunk3.as_mut_ptr().cast(), res3); } } @@ -27,20 +45,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "gfni", enable = "avx512f")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(4 * GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 4 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * 4 * GF256_HALF_ORDER); unsafe { let scalar_simd = _mm512_set1_epi8(scalar as i8); for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { - let mul_vec_chunk_simd = _mm512_loadu_si512(mul_vec_chunk.as_ptr().cast()); - let scaled_res = _mm512_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd); + let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = { + let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(4 * GF256_HALF_ORDER); 
+ + (chunk0, chunk1, chunk2, chunk3) + }; + + let mul_vec_chunk0_simd = _mm512_loadu_si512(mul_vec_chunk0.as_ptr().cast()); + let mul_vec_chunk1_simd = _mm512_loadu_si512(mul_vec_chunk1.as_ptr().cast()); + let mul_vec_chunk2_simd = _mm512_loadu_si512(mul_vec_chunk2.as_ptr().cast()); + let mul_vec_chunk3_simd = _mm512_loadu_si512(mul_vec_chunk3.as_ptr().cast()); + + let scaled_res0 = _mm512_gf2p8mul_epi8(mul_vec_chunk0_simd, scalar_simd); + let scaled_res1 = _mm512_gf2p8mul_epi8(mul_vec_chunk1_simd, scalar_simd); + let scaled_res2 = _mm512_gf2p8mul_epi8(mul_vec_chunk2_simd, scalar_simd); + let scaled_res3 = _mm512_gf2p8mul_epi8(mul_vec_chunk3_simd, scalar_simd); + + let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = { + let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(4 * GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let add_vec_chunk0_simd = _mm512_loadu_si512(add_vec_chunk0.as_ptr().cast()); + let add_vec_chunk1_simd = _mm512_loadu_si512(add_vec_chunk1.as_ptr().cast()); + let add_vec_chunk2_simd = _mm512_loadu_si512(add_vec_chunk2.as_ptr().cast()); + let add_vec_chunk3_simd = _mm512_loadu_si512(add_vec_chunk3.as_ptr().cast()); - let add_vec_chunk_simd = _mm512_loadu_si512(add_vec_chunk.as_ptr().cast()); - let accum_res = _mm512_xor_si512(add_vec_chunk_simd, scaled_res); + let accum_res0 = _mm512_xor_si512(add_vec_chunk0_simd, scaled_res0); + let accum_res1 = _mm512_xor_si512(add_vec_chunk1_simd, scaled_res1); + let accum_res2 = _mm512_xor_si512(add_vec_chunk2_simd, scaled_res2); + let accum_res3 = _mm512_xor_si512(add_vec_chunk3_simd, scaled_res3); - _mm512_storeu_si512(add_vec_chunk.as_mut_ptr().cast(), accum_res); + _mm512_storeu_si512(add_vec_chunk0.as_mut_ptr().cast(), accum_res0); + _mm512_storeu_si512(add_vec_chunk1.as_mut_ptr().cast(), accum_res1); + _mm512_storeu_si512(add_vec_chunk2.as_mut_ptr().cast(), accum_res2); + _mm512_storeu_si512(add_vec_chunk3.as_mut_ptr().cast(), accum_res3); } } diff --git a/src/common/simd/x86/ssse3.rs b/src/common/simd/x86/ssse3.rs index 7ded0d5..c100d78 100644 --- a/src/common/simd/x86/ssse3.rs +++ b/src/common/simd/x86/ssse3.rs @@ -11,7 +11,7 @@ use std::arch::x86_64::*; #[target_feature(enable = "ssse3")] pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { - let mut iter = vec.chunks_exact_mut(GF256_HALF_ORDER); + let mut iter = vec.chunks_exact_mut(4 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast()); @@ -19,17 +19,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { let l_mask = _mm_set1_epi8(0x0f); for chunk in iter.by_ref() { - let chunk_simd = _mm_lddqu_si128(chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm_and_si128(chunk_simd, l_mask); - let chunk_simd_lo = _mm_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm_srli_epi64(chunk_simd, 4); - let chunk_simd_hi = _mm_and_si128(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm_shuffle_epi8(h_tbl, chunk_simd_hi); - - let res = _mm_xor_si128(chunk_simd_lo, chunk_simd_hi); - _mm_storeu_si128(chunk.as_mut_ptr().cast(), res); + let (chunk0, chunk1, chunk2, chunk3) = { + let (chunk0, rest) = chunk.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + + (chunk0, chunk1, 
chunk2, chunk3) + }; + + let chunk0_simd = _mm_lddqu_si128(chunk0.as_ptr().cast()); + let chunk1_simd = _mm_lddqu_si128(chunk1.as_ptr().cast()); + let chunk2_simd = _mm_lddqu_si128(chunk2.as_ptr().cast()); + let chunk3_simd = _mm_lddqu_si128(chunk3.as_ptr().cast()); + + let chunk0_simd_lo = _mm_and_si128(chunk0_simd, l_mask); + let chunk1_simd_lo = _mm_and_si128(chunk1_simd, l_mask); + let chunk2_simd_lo = _mm_and_si128(chunk2_simd, l_mask); + let chunk3_simd_lo = _mm_and_si128(chunk3_simd, l_mask); + + let chunk0_simd_lo = _mm_shuffle_epi8(l_tbl, chunk0_simd_lo); + let chunk1_simd_lo = _mm_shuffle_epi8(l_tbl, chunk1_simd_lo); + let chunk2_simd_lo = _mm_shuffle_epi8(l_tbl, chunk2_simd_lo); + let chunk3_simd_lo = _mm_shuffle_epi8(l_tbl, chunk3_simd_lo); + + let chunk0_simd_hi = _mm_srli_epi64(chunk0_simd, 4); + let chunk1_simd_hi = _mm_srli_epi64(chunk1_simd, 4); + let chunk2_simd_hi = _mm_srli_epi64(chunk2_simd, 4); + let chunk3_simd_hi = _mm_srli_epi64(chunk3_simd, 4); + + let chunk0_simd_hi = _mm_and_si128(chunk0_simd_hi, l_mask); + let chunk1_simd_hi = _mm_and_si128(chunk1_simd_hi, l_mask); + let chunk2_simd_hi = _mm_and_si128(chunk2_simd_hi, l_mask); + let chunk3_simd_hi = _mm_and_si128(chunk3_simd_hi, l_mask); + + let chunk0_simd_hi = _mm_shuffle_epi8(h_tbl, chunk0_simd_hi); + let chunk1_simd_hi = _mm_shuffle_epi8(h_tbl, chunk1_simd_hi); + let chunk2_simd_hi = _mm_shuffle_epi8(h_tbl, chunk2_simd_hi); + let chunk3_simd_hi = _mm_shuffle_epi8(h_tbl, chunk3_simd_hi); + + let res0 = _mm_xor_si128(chunk0_simd_lo, chunk0_simd_hi); + let res1 = _mm_xor_si128(chunk1_simd_lo, chunk1_simd_hi); + let res2 = _mm_xor_si128(chunk2_simd_lo, chunk2_simd_hi); + let res3 = _mm_xor_si128(chunk3_simd_lo, chunk3_simd_hi); + + _mm_storeu_si128(chunk0.as_mut_ptr().cast(), res0); + _mm_storeu_si128(chunk1.as_mut_ptr().cast(), res1); + _mm_storeu_si128(chunk2.as_mut_ptr().cast(), res2); + _mm_storeu_si128(chunk3.as_mut_ptr().cast(), res3); } } @@ -40,31 +76,58 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { #[target_feature(enable = "ssse3")] pub unsafe fn add_vec_into(vec_dst: &mut [u8], vec_src: &[u8]) { - let mut iter_dst = vec_dst.chunks_exact_mut(GF256_HALF_ORDER); - let mut iter_src = vec_src.chunks_exact(GF256_HALF_ORDER); + let mut iter_dst = vec_dst.chunks_exact_mut(4 * GF256_HALF_ORDER); + let mut iter_src = vec_src.chunks_exact(4 * GF256_HALF_ORDER); unsafe { for (chunk_dst, chunk_src) in iter_dst.by_ref().zip(iter_src.by_ref()) { - let chunk_dst_simd = _mm_lddqu_si128(chunk_dst.as_ptr().cast()); - let chunk_src_simd = _mm_lddqu_si128(chunk_src.as_ptr().cast()); - let chunk_result = _mm_xor_si128(chunk_dst_simd, chunk_src_simd); - - _mm_storeu_si128(chunk_dst.as_mut_ptr().cast(), chunk_result); + let (chunk0_dst, chunk1_dst, chunk2_dst, chunk3_dst) = { + let (chunk0, rest) = chunk_dst.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let (chunk0_src, chunk1_src, chunk2_src, chunk3_src) = { + let (chunk0, rest) = chunk_src.split_at_unchecked(GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let chunk0_dst_simd = _mm_lddqu_si128(chunk0_dst.as_ptr().cast()); + let chunk0_src_simd = _mm_lddqu_si128(chunk0_src.as_ptr().cast()); + let chunk0_result = 
_mm_xor_si128(chunk0_dst_simd, chunk0_src_simd); + _mm_storeu_si128(chunk0_dst.as_mut_ptr().cast(), chunk0_result); + + let chunk1_dst_simd = _mm_lddqu_si128(chunk1_dst.as_ptr().cast()); + let chunk1_src_simd = _mm_lddqu_si128(chunk1_src.as_ptr().cast()); + let chunk1_result = _mm_xor_si128(chunk1_dst_simd, chunk1_src_simd); + _mm_storeu_si128(chunk1_dst.as_mut_ptr().cast(), chunk1_result); + + let chunk2_dst_simd = _mm_lddqu_si128(chunk2_dst.as_ptr().cast()); + let chunk2_src_simd = _mm_lddqu_si128(chunk2_src.as_ptr().cast()); + let chunk2_result = _mm_xor_si128(chunk2_dst_simd, chunk2_src_simd); + _mm_storeu_si128(chunk2_dst.as_mut_ptr().cast(), chunk2_result); + + let chunk3_dst_simd = _mm_lddqu_si128(chunk3_dst.as_ptr().cast()); + let chunk3_src_simd = _mm_lddqu_si128(chunk3_src.as_ptr().cast()); + let chunk3_result = _mm_xor_si128(chunk3_dst_simd, chunk3_src_simd); + _mm_storeu_si128(chunk3_dst.as_mut_ptr().cast(), chunk3_result); } } - let remainder_dst = iter_dst.into_remainder(); - let remainder_src = iter_src.remainder(); - - remainder_dst.iter_mut().zip(remainder_src).for_each(|(a, b)| { + iter_dst.into_remainder().iter_mut().zip(iter_src.remainder()).for_each(|(a, b)| { *a ^= b; }); } #[target_feature(enable = "ssse3")] pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { - let mut add_vec_iter = add_into_vec.chunks_exact_mut(GF256_HALF_ORDER); - let mut mul_vec_iter = mul_vec.chunks_exact(GF256_HALF_ORDER); + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * GF256_HALF_ORDER); unsafe { let l_tbl = _mm_lddqu_si128(GF256_SIMD_MUL_TABLE_LOW[scalar as usize].as_ptr().cast()); @@ -72,21 +135,66 @@ pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: let l_mask = _mm_set1_epi8(0x0f); for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { - let mul_vec_chunk_simd = _mm_lddqu_si128(mul_vec_chunk.as_ptr().cast()); - - let chunk_simd_lo = _mm_and_si128(mul_vec_chunk_simd, l_mask); - let chunk_simd_lo = _mm_shuffle_epi8(l_tbl, chunk_simd_lo); - - let chunk_simd_hi = _mm_srli_epi64(mul_vec_chunk_simd, 4); - let chunk_simd_hi = _mm_and_si128(chunk_simd_hi, l_mask); - let chunk_simd_hi = _mm_shuffle_epi8(h_tbl, chunk_simd_hi); - - let scaled_res = _mm_xor_si128(chunk_simd_lo, chunk_simd_hi); - - let add_vec_chunk_simd = _mm_lddqu_si128(add_vec_chunk.as_ptr().cast()); - let accum_res = _mm_xor_si128(add_vec_chunk_simd, scaled_res); - - _mm_storeu_si128(add_vec_chunk.as_mut_ptr().cast(), accum_res); + let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = { + let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_unchecked(GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_unchecked(GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let mul_vec_chunk0_simd = _mm_lddqu_si128(mul_vec_chunk0.as_ptr().cast()); + let mul_vec_chunk1_simd = _mm_lddqu_si128(mul_vec_chunk1.as_ptr().cast()); + let mul_vec_chunk2_simd = _mm_lddqu_si128(mul_vec_chunk2.as_ptr().cast()); + let mul_vec_chunk3_simd = _mm_lddqu_si128(mul_vec_chunk3.as_ptr().cast()); + + let chunk0_simd_lo = _mm_and_si128(mul_vec_chunk0_simd, l_mask); + let chunk1_simd_lo = _mm_and_si128(mul_vec_chunk1_simd, l_mask); + let chunk2_simd_lo = _mm_and_si128(mul_vec_chunk2_simd, l_mask); + let chunk3_simd_lo = _mm_and_si128(mul_vec_chunk3_simd, l_mask); + + let chunk0_simd_lo = 
_mm_shuffle_epi8(l_tbl, chunk0_simd_lo); + let chunk1_simd_lo = _mm_shuffle_epi8(l_tbl, chunk1_simd_lo); + let chunk2_simd_lo = _mm_shuffle_epi8(l_tbl, chunk2_simd_lo); + let chunk3_simd_lo = _mm_shuffle_epi8(l_tbl, chunk3_simd_lo); + + let chunk0_simd_hi = _mm_and_si128(_mm_srli_epi64(mul_vec_chunk0_simd, 4), l_mask); + let chunk1_simd_hi = _mm_and_si128(_mm_srli_epi64(mul_vec_chunk1_simd, 4), l_mask); + let chunk2_simd_hi = _mm_and_si128(_mm_srli_epi64(mul_vec_chunk2_simd, 4), l_mask); + let chunk3_simd_hi = _mm_and_si128(_mm_srli_epi64(mul_vec_chunk3_simd, 4), l_mask); + + let chunk0_simd_hi = _mm_shuffle_epi8(h_tbl, chunk0_simd_hi); + let chunk1_simd_hi = _mm_shuffle_epi8(h_tbl, chunk1_simd_hi); + let chunk2_simd_hi = _mm_shuffle_epi8(h_tbl, chunk2_simd_hi); + let chunk3_simd_hi = _mm_shuffle_epi8(h_tbl, chunk3_simd_hi); + + let scaled_res0 = _mm_xor_si128(chunk0_simd_lo, chunk0_simd_hi); + let scaled_res1 = _mm_xor_si128(chunk1_simd_lo, chunk1_simd_hi); + let scaled_res2 = _mm_xor_si128(chunk2_simd_lo, chunk2_simd_hi); + let scaled_res3 = _mm_xor_si128(chunk3_simd_lo, chunk3_simd_hi); + + let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = { + let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk1, rest) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + let (chunk2, chunk3) = rest.split_at_mut_unchecked(GF256_HALF_ORDER); + + (chunk0, chunk1, chunk2, chunk3) + }; + + let add_vec_chunk0_simd = _mm_lddqu_si128(add_vec_chunk0.as_ptr().cast()); + let add_vec_chunk1_simd = _mm_lddqu_si128(add_vec_chunk1.as_ptr().cast()); + let add_vec_chunk2_simd = _mm_lddqu_si128(add_vec_chunk2.as_ptr().cast()); + let add_vec_chunk3_simd = _mm_lddqu_si128(add_vec_chunk3.as_ptr().cast()); + + let accum_res0 = _mm_xor_si128(add_vec_chunk0_simd, scaled_res0); + let accum_res1 = _mm_xor_si128(add_vec_chunk1_simd, scaled_res1); + let accum_res2 = _mm_xor_si128(add_vec_chunk2_simd, scaled_res2); + let accum_res3 = _mm_xor_si128(add_vec_chunk3_simd, scaled_res3); + + _mm_storeu_si128(add_vec_chunk0.as_mut_ptr().cast(), accum_res0); + _mm_storeu_si128(add_vec_chunk1.as_mut_ptr().cast(), accum_res1); + _mm_storeu_si128(add_vec_chunk2.as_mut_ptr().cast(), accum_res2); + _mm_storeu_si128(add_vec_chunk3.as_mut_ptr().cast(), accum_res3); } } diff --git a/src/lib.rs b/src/lib.rs index 366656a..c5391a7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,9 +115,9 @@ //! //! ```toml //! [dependencies] -//! rlnc = "=0.8.6" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. +//! rlnc = "=0.8.7" # On x86_64 and aarch64 targets, it offers fast encoding, recoding, and decoding using SIMD intrinsics. //! # or -//! rlnc = { version = "=0.8.6", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding. Decoding is not yet parallelized. +//! rlnc = { version = "=0.8.7", features = ["parallel"] } # Uses `rayon`-based data-parallelism for fast encoding/recoding. Decoding is not yet parallelized. //! //! rand = { version = "=0.9.1" } # Required for random number generation //! ```
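Every unrolled kernel in this patch follows the same shape: widen the `chunks_exact`/`chunks_exact_mut` stride to four register-widths, split each big chunk into four independent sub-slices, and issue the four load/compute/store sequences back to back so the CPU can overlap them instead of serializing one vector at a time. Below is a minimal safe-Rust sketch of that pattern; `LANE`, the function name, and the scalar XOR loops are illustrative stand-ins for the register width and the intrinsics, not code from the crate.

```rust
/// Sketch of the 4-way unrolling used by the SIMD kernels above, with
/// plain XOR in place of the _mm/_mm256/_mm512 intrinsics. LANE plays
/// the role of one vector register's width in bytes (e.g. 32 for AVX2).
const LANE: usize = 32;

fn add_vec_into_unrolled(dst: &mut [u8], src: &[u8]) {
    let mut dst_iter = dst.chunks_exact_mut(4 * LANE);
    let mut src_iter = src.chunks_exact(4 * LANE);

    for (d, s) in dst_iter.by_ref().zip(src_iter.by_ref()) {
        // Split one 4 * LANE chunk into four independent LANE-sized
        // slices; the four "register" operations below have no data
        // dependencies on each other, so they can pipeline.
        let (d0, rest) = d.split_at_mut(LANE);
        let (d1, rest) = rest.split_at_mut(LANE);
        let (d2, d3) = rest.split_at_mut(LANE);

        let (s0, rest) = s.split_at(LANE);
        let (s1, rest) = rest.split_at(LANE);
        let (s2, s3) = rest.split_at(LANE);

        for (a, b) in d0.iter_mut().zip(s0) { *a ^= b; }
        for (a, b) in d1.iter_mut().zip(s1) { *a ^= b; }
        for (a, b) in d2.iter_mut().zip(s2) { *a ^= b; }
        for (a, b) in d3.iter_mut().zip(s3) { *a ^= b; }
    }

    // Tail: bytes that didn't fill a whole 4 * LANE block.
    for (a, b) in dst_iter.into_remainder().iter_mut().zip(src_iter.remainder()) {
        *a ^= b;
    }
}
```

The crate's kernels use `split_at_unchecked`/`split_at_mut_unchecked` instead of the checked splits shown here; that is sound in context because `chunks_exact` guarantees every chunk is exactly four sub-chunks long, making the bounds checks of the safe variants provably redundant.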