tevador · SChernykh · Feb 17, 2026 · Sep 7, 2023 · Jan 8, 2026 · Jan 8, 2026
diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
@@ -62,6 +62,7 @@ jobs:
       matrix:
         config:
           - {os: ubuntu-24.04, c: gcc-14, cpp: g++-14}
+          - {os: ubuntu-24.04-arm, c: gcc-14, cpp: g++-14}
 
     steps:
     - name: Install dependencies
@@ -164,23 +165,20 @@ jobs:
 
     strategy:
       matrix:
-        os: [macos-14, macos-15]
+        os: [macos-15, macos-15-intel]
 
     steps:
     - name: Checkout repository
       uses: actions/checkout@v3
       with:
         submodules: recursive
 
-    - name: Install dependencies
-      run: HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake
-
     - name: Build RandomX
       run: |
         mkdir build
         cd build
         cmake ..
-        make -j3
+        make -j4
 
     - name: Run tests
       run: |

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -316,7 +316,7 @@ if(ARCH_ID STREQUAL "riscv64")
     endif()
   endif()
 
-  set_source_files_properties(src/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}")
+  set_source_files_properties(src/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}_zvkned")
   set_source_files_properties(src/aes_hash_rv64_vector.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}")
   set_source_files_properties(src/aes_hash_rv64_zvkned.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}_zvkned")
 endif()
@@ -380,5 +380,13 @@ if(NOT HAVE_CXX_ATOMICS)
   target_link_libraries(randomx-benchmark
     PRIVATE "atomic")
 endif()
+
+check_cxx_source_compiles("
+#include <stdlib.h>
+int main() {
+  void* p;
+  return posix_memalign(&p, 64, 64);
+}" HAVE_POSIX_MEMALIGN)
+
 set_property(TARGET randomx-benchmark PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET randomx-benchmark PROPERTY CXX_STANDARD 11)
diff --git a/doc/configuration.md b/doc/configuration.md
@@ -14,7 +14,7 @@ These parameters can be modified in source file [configuration.h](../src/configu
 |`RANDOMX_SUPERSCALAR_LATENCY`|Target latency for SuperscalarHash (in cycles of the reference CPU)|`170`|
 |`RANDOMX_DATASET_BASE_SIZE`|Dataset base size in bytes|`2147483648`|
 |`RANDOMX_DATASET_EXTRA_SIZE`|Dataset extra size in bytes|`33554368`|
-|`RANDOMX_PROGRAM_SIZE`|The number of instructions in a RandomX program|`256`|
+|`RANDOMX_PROGRAM_SIZE`|The number of instructions in a RandomX program|v1: `256`, v2: `384`|
 |`RANDOMX_PROGRAM_ITERATIONS`|The number of iterations per program|`2048`|
 |`RANDOMX_PROGRAM_COUNT`|The number of programs per hash|`8`|
 |`RANDOMX_JUMP_BITS`|Jump condition mask size in bits|`8`|

diff --git a/doc/design.md b/doc/design.md
@@ -191,7 +191,7 @@ Approximate distribution of floating point register values at the end of each pr
 
 The small number of F register values at `1e+14` is caused by the FSCAL instruction, which significantly increases the range of the register values.
 
-Group E registers cover a very large range of values. About 2% of programs produce at least one `infinity` value.
+Group E registers cover a very large range of values. About 2% (6.85% for RandomX v2) of programs produce at least one `infinity` value.
 
 To maximize entropy and also to fit into one 64-byte cache line, floating point registers are combined using the XOR operation at the end of each iteration before being stored into the Scratchpad.
 
@@ -254,13 +254,18 @@ The Scratchpad is split into 3 levels to mimic the typical CPU cache hierarchy [
 |CPU μ-architecture|L1 latency|L2 latency|L3 latency|source|
 |----------------|----------|----------|----------|------|
 ARM Cortex A55|2|6|-|[[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)]
-|AMD Zen+|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
+|AMD Zen+, Zen 2|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
+|AMD Zen 3/4|4|14|47|[[31](https://agner.org/optimize/microarchitecture.pdf)]|
+|AMD Zen 5|4|14|55|[[31](https://agner.org/optimize/microarchitecture.pdf)]|
 |Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)]
+|Intel Alder Lake P core|5|15|65|[[31](https://agner.org/optimize/microarchitecture.pdf)]|
 
 The L3 cache is much larger and located further from the CPU core. As a result, its access latencies are much higher and can cause stalls in program execution.
 
 RandomX therefore performs only 2 random accesses into "L3" Scratchpad per program iteration (steps 2 and 3 in chapter 4.6.2 of the Specification). Register values from a given iteration are written into the same locations they were loaded from, which guarantees that the required cache lines have been moved into the faster L1 or L2 caches.
 
+RandomX v2 partially hides the stalls caused by accessing the "L3" Scratchpad by performing an extended mixing of group F and group E registers (step 10 in chapter 4.6.2 of the Specification). This keeps the CPU busy at all times so it doesn't waste energy while waiting for data.
+
 Additionally, integer instructions that read from a fixed address also use the whole "L3" Scratchpad (Table 5.1.4 of the Specification) because repetitive accesses will ensure that the cache line will be placed in the L1 cache of the CPU. This shows that the Scratchpad level doesn't always directly correspond to the same CPU cache level.
 
 #### 2.8.2 Scratchpad writes
@@ -648,3 +653,5 @@ Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Tab
 [29] 7-Zip File archiver - https://www.7-zip.org/
 
 [30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html
+
+[31] Agner Fog, The microarchitecture of Intel, AMD, and VIA CPUs - https://agner.org/optimize/microarchitecture.pdf
diff --git a/doc/design_v2.md b/doc/design_v2.md
@@ -0,0 +1,111 @@
+# RandomX v2 changes and their rationale
+
+## 1. CFROUND tweak
+
+In RandomX v1, the CFROUND instruction changes the rounding mode on each main loop iteration. Unfortunately, x86 CPUs were not designed for the rounding mode to change that often. As a result, this single instruction costs up to 10% of the hashrate on Ryzen CPUs. This is where an ASIC or a specially designed CPU can get an easy advantage.
+
+RandomX v2 reduces the impact massively: CFROUND will now change the rounding mode only every 16th time it executes (on average).
+
+## 2. AES tweak
+
+F and E registers are now mixed together with AES instead of XOR (step 10 in chapter 4.6.2 of the spec).
+
+- AES tweak doubles the amount of AES computations per hash without hurting the hashrate (it uses the gap in RandomX main loop where the CPU was sitting idle, waiting for scratchpad data).
+- AES tweak also introduces AES in the main RandomX loop, which makes it harder for specialized hardware to get away with just a dedicated circuit for scratchpad initialization - AES must be implemented as a part of the RandomX VM and work with the RandomX VM's registers.
+- AES tweak also improves data entropy (makes it more random) before it's written to the scratchpad.
+
+## 3. Program size increase from 256 to 384
+
+CPUs have become much faster since the original RandomX was released. Back in 2019, the Ryzen 9 3950X was the fastest desktop CPU for RandomX, and at the time of writing (January 2026) it's the Ryzen 9 9950X. In most CPU benchmarks, the 9950X is more than 1.5x faster on average - thanks to a clock speed increase from < 4 to > 5 GHz, and to IPC improvements.
+
+But in RandomX it's only 20-25% faster, because it's bottlenecked by RAM latency. While CPU cores got faster over the years, RAM latency stayed basically the same - tuned DDR4 memory from 2019 and tuned DDR5 memory from 2026 both have an access latency of around 50-55 ns.
+
+This imbalance is the main reason for the program size increase - Zen 5 and newer CPUs need more work to keep themselves busy while they're waiting for data from memory.
+
+## 4. Prefetch two main loop iterations ahead instead of just one
+
+RandomX v1 prefetches data from the dataset one iteration ahead. RandomX v2 increases it to two iterations by redefining the prefetch logic (see the `mp` register in specs.md).
+
+This change complements the program size increase tweak and has the same purpose.
+
+## 5. Performance impact
+
+Tests show that RandomX v2, while being more than 1.5 times "heavier" than RandomX v1, results in only a slight hashrate reduction but massive efficiency improvements (in terms of VM+AES instructions per Joule):
+
+### AMD Ryzen 9 9950X (Zen 5) @ 285W (PBO max)
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|27186.1|100.0%|95.38|121.15e9|425.1e6|100.0%|
+RandomX v2|26791.7|98.55%|94.01|182.61e9|640.72e6|**150.72%**|
+
+### AMD Ryzen 9 9950X (Zen 5) @ 100W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|19912.2|100.0%|199.122|88.74e9|887.38e6|100.0%|
+RandomX v2|17346.2|87.11%|173.462|118.23e9|1182.27e6|**133.23%**|
+
+### AMD Ryzen AI 9 HX 370 (Zen 5), DDR5-5600 @ 28W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|6597.15|100.0%|235.61|29.4e9|1050e6|100.0%|
+RandomX v2|7121.69|107.95%|254.35|48.54e9|1733.56e6|**165.1%**|
+
+### Ryzen AI 9 365 (Zen 5) @ 28W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|6091|100.0%|217.5|27.14e9|969.44e6|100.0%|
+RandomX v2|6649|109.2%|237.5|45.32e9|1618.5e6|**166.95%**|
+
+### Ryzen 9 7945HX (Zen 4) @ 62W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|16126|100.0%|260.1|71.86e9|1159.11e6|100.0%|
+RandomX v2|15308|94.9%|246.9|104.33e9|1682.83e6|**145.18%**|
+
+### Ryzen 5 8600G (Zen 4) @ 45W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|5876.47|100.0%|130.59|26.19e9|581.96e6|100.0%|
+RandomX v2|5375.29|91.5%|119.45|36.64e9|814.15e6|**139.9%**|
+
+### Ryzen 9 5950X (Zen 3) @ 122-126W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|14745.9 @ 126W|100.0%|117.03|65.71e9|521.54e6|100.0%|
+RandomX v2|12905.3 @ 122W|87.5%|105.78|87.96e9|720.98e6|**138.2%**|
+
+### Ryzen 9 3950X (Zen 2) @ 131W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|15049.34|100.0%|114.88|67.07e9|511.96e6|100.0%|
+RandomX v2|13868.64|92.15%|105.87|94.53e9|721.57e6|**140.94%**|
+
+### Ryzen 7 3700X (Zen 2) @ 88W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|8624|100.0%|98|38.43e9|436.73e6|100.0%|
+RandomX v2|7361|85.35%|83.65|50.17e9|570.12e6|**130.54%**|
+
+### Ryzen 7 1700X (Zen 1) @ 95W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|4832.73|100.0%|50.87|21.54e9|226.7e6|100.0%|
+RandomX v2|4870.41|100.78%|51.27|33.2e9|349.43e6|**154.13%**|
+
+### Intel Core i9-12900K @ 125W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|8644.47|100.0%|69.16|38.52e9|308.19e6|100.0%|
+RandomX v2|8310.78|96.14%|66.49|56.64e9|453.15e6|**147.04%**|
+
+### Intel Core i7-8650U @ 15W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|1831.15|100.0%|122.08|8.16e9|544.03e6|100.0%|
+RandomX v2|1415|77.27%|94.33|9.64e9|642.95e6|**118.18%**|
+
+### Intel Core i7-6820HQ @ 45W
+|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
+|-|-|-|-|-|-|-|
+RandomX v1|1968.56|100.0%|43.75|8.77e9|194.95e6|100.0%|
+RandomX v2|1488.25|75.6%|33.07|10.14e9|225.41e6|**115.62%**|
diff --git a/doc/specs.md b/doc/specs.md
@@ -319,7 +319,10 @@ To access a particular scratchpad level, bitwise AND with a mask according to ta
 
 The VM has 8 integer registers `r0`-`r7` (group R) and a total of 12 floating point registers split into 3 groups: `f0`-`f3` (group F), `e0`-`e3` (group E) and `a0`-`a3` (group A). Integer registers are 64 bits wide, while floating point registers are 128 bits wide and contain a pair of numbers in floating point format. The lower and upper half of floating point registers are not separately addressable.
 
-Additionally, there are 3 internal registers `ma`, `mx` and `fprc`.
+Additionally, there are 4 internal registers `ma`, `mx`, `mt` and `fprc`.
+
+- RandomX v1: `mp` is a name alias for `mx`
+- RandomX v2: `mp` is a name alias for `ma`
 
 Integer registers `r0`-`r7` can be the source or the destination operands of integer instructions or may be used as address registers for accessing the Scratchpad.
 
@@ -447,12 +450,14 @@ The loop described below is repeated until the value of the `ic` register reache
 2. `spAddr0` is used to perform a 64-byte aligned read from Scratchpad level 3 (using mask from Table 4.2.1). The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
 3. `spAddr1` is used to perform a 64-byte aligned read from Scratchpad level 3 (using mask from Table 4.2.1). Each floating point register `f0`-`f3` and `e0`-`e3` is initialized using an 8-byte value according to the conversion rules from chapters 4.3.1 and 4.3.2.
 4. The 256 instructions stored in the Program Buffer are executed.
-5. The `mx` register is XORed with the low 32 bits of registers `readReg2` and `readReg3` (see Table 4.5.3).
-6. A 64-byte Dataset item at address `datasetOffset + mx % RANDOMX_DATASET_BASE_SIZE` is prefetched from the Dataset (it will be used during the next iteration).
-7. A 64-byte Dataset item at address `datasetOffset + ma % RANDOMX_DATASET_BASE_SIZE` is loaded from the Dataset. The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
+5. The value of `ma` is saved in `mt`. Then the `mp` register is XORed with the low 32 bits of registers `readReg2` and `readReg3` (see Table 4.5.3).
+6. A 64-byte Dataset item at address `datasetOffset + mp % RANDOMX_DATASET_BASE_SIZE` is prefetched from the Dataset (it will be used during the next iteration(s)).
+7. A 64-byte Dataset item at address `datasetOffset + mt % RANDOMX_DATASET_BASE_SIZE` is loaded from the Dataset. The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
 8. The values of registers `mx` and `ma` are swapped.
 9. The values of all integer registers `r0`-`r7` are written to the Scratchpad (L3) at address `spAddr1` (64-byte aligned).
-10. Register `f0` is XORed with register `e0` and the result is stored in register `f0`. Register `f1` is XORed with register `e1` and the result is stored in register `f1`. Register `f2` is XORed with register `e2` and the result is stored in register `f2`. Register `f3` is XORed with register `e3` and the result is stored in register `f3`.
+10. Group F registers are mixed with group E registers.
+- **RandomX v1:** `fi = fi XOR ei` for i = 0,1,2,3
+- **RandomX v2:** `f0 = AES encrypt of f0 with e0 as key`, `f1 = AES decrypt of f1 with e0 as key`, `f2 = AES encrypt of f2 with e0 as key`, `f3 = AES decrypt of f3 with e0 as key`. These steps are then repeated with `e1`, `e2` and `e3` as keys.
 11. The values of registers `f0`-`f3` are written to the Scratchpad (L3) at address `spAddr0` (64-byte aligned).
 12. `spAddr0` and `spAddr1` are both set to zero.
 13. `ic` is decreased by 1.
@@ -667,7 +672,12 @@ There are 2 control instructions.
 |25/256|CBRANCH|R|-|`dst = dst + cimm`, conditional jump
 
 #### 5.4.1 CFROUND
-This instruction calculates a 2-bit value by rotating the source register right by `imm32` bits and taking the 2 least significant bits (the value of the source register is unaffected). The result is stored in the `fprc` register. This changes the rounding mode of all subsequent floating point instructions.
+This instruction calculates a value by rotating the source register right by `imm32` bits (the value of the source register is unaffected).
+
+- **RandomX v1**: bits 0-1 of the result are stored in the `fprc` register.
+- **RandomX v2**: if bits 2-5 of the result are all 0, bits 0-1 of the result are stored in the `fprc` register; otherwise `fprc` is left unchanged.
+
+This sets the rounding mode of all subsequent floating point instructions.
 
 #### 5.4.2 CBRANCH
 

diff --git a/src/aes_hash.cpp b/src/aes_hash.cpp
@@ -68,7 +68,7 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	assert(inputSize % 64 == 0);
 
 #ifdef __riscv
-	if (randomx::cpu.hasAes()) {
+	if (!softAes) {
 		hashAes1Rx4_zvkned(input, inputSize, hash);
 		return;
 	}
@@ -153,7 +153,7 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 	assert(outputSize % 64 == 0);
 
 #ifdef __riscv
-	if (randomx::cpu.hasAes()) {
+	if (!softAes) {
 		fillAes1Rx4_zvkned(state, outputSize, buffer);
 		return;
 	}
@@ -221,7 +221,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 	assert(outputSize % 64 == 0);
 
 #ifdef __riscv
-	if (randomx::cpu.hasAes()) {
+	if (!softAes) {
 		fillAes4Rx4_zvkned(state, outputSize, buffer);
 		return;
 	}
@@ -288,7 +288,7 @@ template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
 template<bool softAes>
 void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
 #ifdef __riscv
-	if (randomx::cpu.hasAes()) {
+	if (!softAes) {
 		hashAndFillAes1Rx4_zvkned(scratchpad, scratchpadSize, hash, fill_state);
 		return;
 	}

diff --git a/src/aes_hash_rv64_vector.cpp b/src/aes_hash_rv64_vector.cpp
@@ -70,6 +70,19 @@ static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee
 static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
 static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 12, 8, 4, 0, 76, 72, 68, 64 };
 
+#define lutEnc0 randomx_aes_lut_enc[0]
+#define lutEnc1 randomx_aes_lut_enc[1]
+#define lutEnc2 randomx_aes_lut_enc[2]
+#define lutEnc3 randomx_aes_lut_enc[3]
+
+#define lutDec0 randomx_aes_lut_dec[0]
+#define lutDec1 randomx_aes_lut_dec[1]
+#define lutDec2 randomx_aes_lut_dec[2]
+#define lutDec3 randomx_aes_lut_dec[3]
+
+#define lutEncIndex randomx_aes_lut_enc_index
+#define lutDecIndex randomx_aes_lut_dec_index
+
 void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) {
 	const uint8_t* inptr = (const uint8_t*)input;
 	const uint8_t* inputEnd = inptr + inputSize;

diff --git a/src/asm/configuration.asm b/src/asm/configuration.asm
@@ -7,7 +7,9 @@ RANDOMX_CACHE_ACCESSES EQU 8t
 RANDOMX_SUPERSCALAR_LATENCY EQU 170t
 RANDOMX_DATASET_BASE_SIZE EQU 2147483648t
 RANDOMX_DATASET_EXTRA_SIZE EQU 33554368t
-RANDOMX_PROGRAM_SIZE EQU 256t
+RANDOMX_PROGRAM_SIZE_V1 EQU 256t
+RANDOMX_PROGRAM_SIZE_V2 EQU 384t
+RANDOMX_PROGRAM_MAX_SIZE EQU 384t
 RANDOMX_PROGRAM_ITERATIONS EQU 2048t
 RANDOMX_PROGRAM_COUNT EQU 8t
 RANDOMX_SCRATCHPAD_L3 EQU 2097152t

diff --git a/src/asm/program_epilogue_linux.inc b/src/asm/program_epilogue_linux.inc
@@ -1,10 +1,12 @@
 	;# restore callee-saved registers - System V AMD64 ABI
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rbp
-	pop rbx
+	mov r15, qword ptr [rsp+280]
+	mov r14, qword ptr [rsp+272]
+	mov r13, qword ptr [rsp+264]
+	mov r12, qword ptr [rsp+256]
+	mov rbp, qword ptr [rsp+232]
+	mov rbx, qword ptr [rsp+224]
+
+	add rsp, 456
 
 	;# program finished
-	ret 0
+	ret
diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc
@@ -1,5 +1,5 @@
 	;# save VM register values
-	pop rcx
+	mov rcx, qword ptr [rsp+448]
 	mov qword ptr [rcx+0], r8
 	mov qword ptr [rcx+8], r9
 	mov qword ptr [rcx+16], r10

diff --git a/src/asm/program_epilogue_win64.inc b/src/asm/program_epilogue_win64.inc
@@ -1,24 +1,24 @@
 	;# restore callee-saved registers - Microsoft x64 calling convention
-	movdqu xmm15, xmmword ptr [rsp]
-	movdqu xmm14, xmmword ptr [rsp+16]
-	movdqu xmm13, xmmword ptr [rsp+32]
-	movdqu xmm12, xmmword ptr [rsp+48]
-	movdqu xmm11, xmmword ptr [rsp+64]
-	add rsp, 80
-	movdqu xmm10, xmmword ptr [rsp]
-	movdqu xmm9, xmmword ptr [rsp+16]
-	movdqu xmm8, xmmword ptr [rsp+32]
-	movdqu xmm7, xmmword ptr [rsp+48]
-	movdqu xmm6, xmmword ptr [rsp+64]
-	add rsp, 80
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rsi
-	pop rdi
-	pop rbp
-	pop rbx
+	movdqa xmm15, xmmword ptr [rsp+432]
+	movdqa xmm14, xmmword ptr [rsp+416]
+	movdqa xmm13, xmmword ptr [rsp+400]
+	movdqa xmm12, xmmword ptr [rsp+384]
+	movdqa xmm11, xmmword ptr [rsp+368]
+	movdqa xmm10, xmmword ptr [rsp+352]
+	movdqa xmm9, xmmword ptr [rsp+336]
+	movdqa xmm8, xmmword ptr [rsp+320]
+	movdqa xmm7, xmmword ptr [rsp+304]
+	movdqa xmm6, xmmword ptr [rsp+288]
+	mov r15, qword ptr [rsp+280]
+	mov r14, qword ptr [rsp+272]
+	mov r13, qword ptr [rsp+264]
+	mov r12, qword ptr [rsp+256]
+	mov rdi, qword ptr [rsp+248]
+	mov rsi, qword ptr [rsp+240]
+	mov rbp, qword ptr [rsp+232]
+	mov rbx, qword ptr [rsp+224]
+
+	add rsp, 456
 
 	;# program finished
 	ret