Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1115ec9
RandomX v2 virtual machine changes
tevador Sep 7, 2023
fce33ee
Interpreter support for v2, tests
SChernykh Jan 8, 2026
da7af41
Enabled CI
SChernykh Jan 8, 2026
ebe57ef
RISC-V: added CFROUND v2
SChernykh Jan 10, 2026
3f46cd7
RISC-V: added v2 FE mix code (hardware AES)
SChernykh Jan 10, 2026
8c80f3a
RISC-V: added v2 FE mix code (software AES)
SChernykh Jan 11, 2026
19cd192
API to switch between v1 and v2 on the fly
SChernykh Jan 11, 2026
02f6c4d
RISC-V: added v2 FE mix code (scalar software AES)
SChernykh Jan 12, 2026
4821dbf
Set v2 program size to 384
SChernykh Jan 12, 2026
5891f5d
Improved RISC-V code
SChernykh Jan 13, 2026
67df5bd
Updated documentation for v2
SChernykh Jan 13, 2026
6d915f6
Improved RISC-V code
SChernykh Jan 14, 2026
1cefbd7
Added v2 design doc
SChernykh Jan 16, 2026
6346aa4
Added more CPU benchmarks
SChernykh Jan 17, 2026
4d3c4f2
Added prefetch tweak (x64 only for now)
SChernykh Jan 23, 2026
52406c6
Fixed v2 prefetch code
SChernykh Jan 23, 2026
d2a2e93
Prefetch first 2 iterations of the loop
SChernykh Jan 23, 2026
d82f065
Prefetch tweak (RISC-V)
SChernykh Jan 24, 2026
ddfd0c4
Prefetch tweak (aarch64)
SChernykh Jan 24, 2026
0fcd8f6
Updated CPU tests
SChernykh Jan 26, 2026
b4a374a
Cleanup
SChernykh Jan 26, 2026
4bca419
Added more CPU tests
SChernykh Jan 27, 2026
1cd1e8f
Update configuration.md
SChernykh Jan 27, 2026
6032448
Added 9950X tests
SChernykh Jan 28, 2026
1a9f14b
ARM64: removed duplicate AES tables
SChernykh Feb 3, 2026
6dca84f
Fixed typo
SChernykh Feb 14, 2026
3842469
ARM64: init AES pointers in a safe way
SChernykh Feb 14, 2026
3fd1d7c
Clarified temporary register use
SChernykh Feb 14, 2026
5c00375
Implemented `rx_aligned_alloc` for portable fallback
SChernykh Feb 14, 2026
2b9ab3e
Fixed misaligned access
SChernykh Feb 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ jobs:
matrix:
config:
- {os: ubuntu-24.04, c: gcc-14, cpp: g++-14}
- {os: ubuntu-24.04-arm, c: gcc-14, cpp: g++-14}

steps:
- name: Install dependencies
Expand Down Expand Up @@ -164,23 +165,20 @@ jobs:

strategy:
matrix:
os: [macos-14, macos-15]
os: [macos-15, macos-15-intel]

steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: recursive

- name: Install dependencies
run: HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake

- name: Build RandomX
run: |
mkdir build
cd build
cmake ..
make -j3
make -j4

- name: Run tests
run: |
Expand Down
10 changes: 9 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ if(ARCH_ID STREQUAL "riscv64")
endif()
endif()

set_source_files_properties(src/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}")
set_source_files_properties(src/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}_zvkned")
set_source_files_properties(src/aes_hash_rv64_vector.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}")
set_source_files_properties(src/aes_hash_rv64_zvkned.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}_zvkned")
endif()
Expand Down Expand Up @@ -380,5 +380,13 @@ if(NOT HAVE_CXX_ATOMICS)
target_link_libraries(randomx-benchmark
PRIVATE "atomic")
endif()

check_cxx_source_compiles("
#include <stdlib.h>
int main() {
void* p;
return posix_memalign(&p, 64, 64);
}" HAVE_POSIX_MEMALIGN)

set_property(TARGET randomx-benchmark PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET randomx-benchmark PROPERTY CXX_STANDARD 11)
2 changes: 1 addition & 1 deletion doc/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ These parameters can be modified in source file [configuration.h](../src/configu
|`RANDOMX_SUPERSCALAR_LATENCY`|Target latency for SuperscalarHash (in cycles of the reference CPU)|`170`|
|`RANDOMX_DATASET_BASE_SIZE`|Dataset base size in bytes|`2147483648`|
|`RANDOMX_DATASET_EXTRA_SIZE`|Dataset extra size in bytes|`33554368`|
|`RANDOMX_PROGRAM_SIZE`|The number of instructions in a RandomX program|`256`|
|`RANDOMX_PROGRAM_SIZE`|The number of instructions in a RandomX program|v1: `256`, v2: `384`|
|`RANDOMX_PROGRAM_ITERATIONS`|The number of iterations per program|`2048`|
|`RANDOMX_PROGRAM_COUNT`|The number of programs per hash|`8`|
|`RANDOMX_JUMP_BITS`|Jump condition mask size in bits|`8`|
Expand Down
11 changes: 9 additions & 2 deletions doc/design.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ Approximate distribution of floating point register values at the end of each pr

The small number of F register values at `1e+14` is caused by the FSCAL instruction, which significantly increases the range of the register values.

Group E registers cover a very large range of values. About 2% of programs produce at least one `infinity` value.
Group E registers cover a very large range of values. About 2% (6.85% for RandomX v2) of programs produce at least one `infinity` value.

To maximize entropy and also to fit into one 64-byte cache line, floating point registers are combined using the XOR operation at the end of each iteration before being stored into the Scratchpad.

Expand Down Expand Up @@ -254,13 +254,18 @@ The Scratchpad is split into 3 levels to mimic the typical CPU cache hierarchy [
|CPU μ-architecture|L1 latency|L2 latency|L3 latency|source|
|----------------|----------|----------|----------|------|
|ARM Cortex A55|2|6|-|[[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)]|
|AMD Zen+|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
|AMD Zen+, Zen 2|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
|AMD Zen 3/4|4|14|47|[[31](https://agner.org/optimize/microarchitecture.pdf)]|
|AMD Zen 5|4|14|55|[[31](https://agner.org/optimize/microarchitecture.pdf)]|
|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)]|
|Intel Alder Lake P core|5|15|65|[[31](https://agner.org/optimize/microarchitecture.pdf)]|

The L3 cache is much larger and located further from the CPU core. As a result, its access latencies are much higher and can cause stalls in program execution.

RandomX therefore performs only 2 random accesses into "L3" Scratchpad per program iteration (steps 2 and 3 in chapter 4.6.2 of the Specification). Register values from a given iteration are written into the same locations they were loaded from, which guarantees that the required cache lines have been moved into the faster L1 or L2 caches.

RandomX v2 partially hides the stalls from accessing "L3" Scratchpad by doing an extended mixing of group F and group E registers (step 10 in chapter 4.6.2 of the Specification). This keeps the CPU busy at all times so it doesn't waste energy while waiting for data.

Additionally, integer instructions that read from a fixed address also use the whole "L3" Scratchpad (Table 5.1.4 of the Specification) because repetitive accesses will ensure that the cache line will be placed in the L1 cache of the CPU. This shows that the Scratchpad level doesn't always directly correspond to the same CPU cache level.

#### 2.8.2 Scratchpad writes
Expand Down Expand Up @@ -648,3 +653,5 @@ Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Tab
[29] 7-Zip File archiver - https://www.7-zip.org/

[30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html

[31] Agner Fog, The microarchitecture of Intel, AMD, and VIA CPUs - https://agner.org/optimize/microarchitecture.pdf
111 changes: 111 additions & 0 deletions doc/design_v2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# RandomX v2 changes and their rationale

## 1. CFROUND tweak

In RandomX v1, CFROUND instruction changes the rounding mode on each main loop iteration. Unfortunately, x86 CPUs were not designed for the rounding mode changing that often. As a result, this single instruction costs up to 10% of hashrate on Ryzen CPUs. This is where an ASIC or a specially designed CPU can get an easy advantage.

RandomX v2 reduces the impact massively: CFROUND will now change the rounding mode only every 16th time it executes (on average).

## 2. AES tweak

F and E registers are now mixed together with AES instead of XOR (step 10 in chapter 4.6.2 of the spec).

- AES tweak doubles the amount of AES computations per hash without hurting the hashrate (it uses the gap in RandomX main loop where the CPU was sitting idle, waiting for scratchpad data).
- AES tweak also introduces AES in the main RandomX loop which makes it harder for specialized hardware to get away with just a dedicated circuit for scratchpad initialization - AES must be implemented as a part of RandomX VM and work with RandomX VM's registers.
- AES tweak also improves data entropy (makes it more random) before it's written to the scratchpad.

## 3. Program size increase from 256 to 384

CPUs got much faster since the original RandomX was released. Back in 2019, Ryzen 9 3950X was the fastest desktop CPU for RandomX, and at the time of writing (January 2026) it's Ryzen 9 9950X. In most CPU benchmarks, 9950X is more than 1.5x faster on average - thanks to clock speed increase from < 4 to > 5 GHz, and to IPC improvements.

But in RandomX it's only 20-25% faster, because it's bottlenecked by the RAM latency. While CPU cores got faster over the years, RAM latency stayed basically the same - a tuned DDR4 memory from 2019, and a tuned DDR5 memory from 2026 will both have the same access latency of around 50-55 ns.

This imbalance is the main reason for the program size increase - Zen5 and newer CPUs need more work to keep themselves busy while they're waiting for data from memory.

## 4. Prefetch two main loop iterations ahead instead of just one

RandomX v1 prefetches data from the dataset one iteration ahead. RandomX v2 increases it to two iterations by redefining the prefetch logic (see the `mp` register in specs.md).

This change complements the program size increase tweak and has the same purpose.

## 5. Performance impact

Tests show that RandomX v2, while being more than 1.5 times "heavier" than RandomX v1, results in only a slight hashrate reduction but massive efficiency improvements (in terms of VM+AES instructions per Joule):

### AMD Ryzen 9 9950X (Zen 5) @ 285W (PBO max)
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|27186.1|100.0%|95.38|121.15e9|425.1e6|100.0%|
RandomX v2|26791.7|98.55%|94.01|182.61e9|640.72e6|**150.72%**|

### AMD Ryzen 9 9950X (Zen 5) @ 100W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|19912.2|100.0%|199.122|88.74e9|887.38e6|100.0%|
RandomX v2|17346.2|87.11%|173.462|118.23e9|1182.27e6|**133.23%**|

### AMD Ryzen AI 9 HX 370 (Zen 5), DDR5-5600 @ 28W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|6597.15|100.0%|235.61|29.4e9|1050e6|100.0%|
RandomX v2|7121.69|107.95%|254.35|48.54e9|1733.56e6|**165.1%**|

### Ryzen AI 9 365 (Zen 5) @ 28W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|6091|100.0%|217.5|27.14e9|969.44e6|100.0%|
RandomX v2|6649|109.2%|237.5|45.32e9|1618.5e6|**166.95%**|

### Ryzen 9 7945HX (Zen 4) @ 62W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|16126|100.0%|260.1|71.86e9|1159.11e6|100.0%|
RandomX v2|15308|94.9%|246.9|104.33e9|1682.83e6|**145.18%**|

### Ryzen 5 8600G (Zen 4) @ 45W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|5876.47|100.0%|130.59|26.19e9|581.96e6|100.0%|
RandomX v2|5375.29|91.5%|119.45|36.64e9|814.15e6|**139.9%**|

### Ryzen 9 5950X (Zen 3) @ 122-126W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|14745.9 @ 126W|100.0%|117.03|65.71e9|521.54e6|100.0%|
RandomX v2|12905.3 @ 122W|87.5%|105.78|87.96e9|720.98e6|**138.2%**|

### Ryzen 9 3950X (Zen 2) @ 131W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|15049.34|100.0%|114.88|67.07e9|511.96e6|100.0%|
RandomX v2|13868.64|92.15%|105.87|94.53e9|721.57e6|**140.94%**|

### Ryzen 7 3700X (Zen 2) @ 88W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|8624|100.0%|98|38.43e9|436.73e6|100.0%|
RandomX v2|7361|85.35%|83.65|50.17e9|570.12e6|**130.54%**|

### Ryzen 7 1700X (Zen 1) @ 95W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|4832.73|100.0%|50.87|21.54e9|226.7e6|100.0%|
RandomX v2|4870.41|100.78%|51.27|33.2e9|349.43e6|**154.13%**|

### Intel Core i9-12900K @ 125W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|8644.47|100.0%|69.16|38.52e9|308.19e6|100.0%|
RandomX v2|8310.78|96.14%|66.49|56.64e9|453.15e6|**147.04%**|

### Intel Core i7-8650U @ 15W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|1831.15|100.0%|122.08|8.16e9|544.03e6|100.0%|
RandomX v2|1415|77.27%|94.33|9.64e9|642.95e6|**118.18%**|

### Intel Core i7-6820HQ @ 45W
|Algorithm|Hashrate|Relative speed|Hash/Joule|VM+AES/s|VM+AES/Joule|Relative work/Joule|
|-|-|-|-|-|-|-|
RandomX v1|1968.56|100.0%|43.75|8.77e9|194.95e6|100.0%|
RandomX v2|1488.25|75.6%|33.07|10.14e9|225.41e6|**115.62%**|
22 changes: 16 additions & 6 deletions doc/specs.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ To access a particular scratchpad level, bitwise AND with a mask according to ta

The VM has 8 integer registers `r0`-`r7` (group R) and a total of 12 floating point registers split into 3 groups: `f0`-`f3` (group F), `e0`-`e3` (group E) and `a0`-`a3` (group A). Integer registers are 64 bits wide, while floating point registers are 128 bits wide and contain a pair of numbers in floating point format. The lower and upper half of floating point registers are not separately addressable.

Additionally, there are 3 internal registers `ma`, `mx` and `fprc`.
Additionally, there are 4 internal registers `ma`, `mx`, `mt` and `fprc`.

- RandomX v1: `mp` is a name alias for `mx`
- RandomX v2: `mp` is a name alias for `ma`

Integer registers `r0`-`r7` can be the source or the destination operands of integer instructions or may be used as address registers for accessing the Scratchpad.

Expand Down Expand Up @@ -447,12 +450,14 @@ The loop described below is repeated until the value of the `ic` register reache
2. `spAddr0` is used to perform a 64-byte aligned read from Scratchpad level 3 (using mask from Table 4.2.1). The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
3. `spAddr1` is used to perform a 64-byte aligned read from Scratchpad level 3 (using mask from Table 4.2.1). Each floating point register `f0`-`f3` and `e0`-`e3` is initialized using an 8-byte value according to the conversion rules from chapters 4.3.1 and 4.3.2.
4. The instructions stored in the Program Buffer (256 in RandomX v1, 384 in RandomX v2) are executed.
5. The `mx` register is XORed with the low 32 bits of registers `readReg2` and `readReg3` (see Table 4.5.3).
6. A 64-byte Dataset item at address `datasetOffset + mx % RANDOMX_DATASET_BASE_SIZE` is prefetched from the Dataset (it will be used during the next iteration).
7. A 64-byte Dataset item at address `datasetOffset + ma % RANDOMX_DATASET_BASE_SIZE` is loaded from the Dataset. The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
5. The value of `ma` is saved in `mt`. Then the `mp` register is XORed with the low 32 bits of registers `readReg2` and `readReg3` (see Table 4.5.3).
6. A 64-byte Dataset item at address `datasetOffset + mp % RANDOMX_DATASET_BASE_SIZE` is prefetched from the Dataset (it will be used during the next iteration(s)).
7. A 64-byte Dataset item at address `datasetOffset + mt % RANDOMX_DATASET_BASE_SIZE` is loaded from the Dataset. The 64 bytes are XORed with all integer registers in order `r0`-`r7`.
8. The values of registers `mx` and `ma` are swapped.
9. The values of all integer registers `r0`-`r7` are written to the Scratchpad (L3) at address `spAddr1` (64-byte aligned).
10. Register `f0` is XORed with register `e0` and the result is stored in register `f0`. Register `f1` is XORed with register `e1` and the result is stored in register `f1`. Register `f2` is XORed with register `e2` and the result is stored in register `f2`. Register `f3` is XORed with register `e3` and the result is stored in register `f3`.
10. Group F registers are mixed with group E registers.
- **RandomX v1:** `fi = fi XOR ei` for i = 0,1,2,3
- **RandomX v2:** `f0 = AES encrypt of f0 with e0 as key`, `f1 = AES decrypt of f1 with e0 as key`, `f2 = AES encrypt of f2 with e0 as key`, `f3 = AES decrypt of f3 with e0 as key`. These steps are then repeated with `e1`, `e2` and `e3` as keys.
11. The values of registers `f0`-`f3` are written to the Scratchpad (L3) at address `spAddr0` (64-byte aligned).
12. `spAddr0` and `spAddr1` are both set to zero.
13. `ic` is decreased by 1.
Expand Down Expand Up @@ -667,7 +672,12 @@ There are 2 control instructions.
|25/256|CBRANCH|R|-|`dst = dst + cimm`, conditional jump

#### 5.4.1 CFROUND
This instruction calculates a 2-bit value by rotating the source register right by `imm32` bits and taking the 2 least significant bits (the value of the source register is unaffected). The result is stored in the `fprc` register. This changes the rounding mode of all subsequent floating point instructions.
This instruction calculates its result by rotating the source register right by `imm32` bits (the value of the source register itself is unaffected).

- **RandomX v1**: bits 0-1 of the result are stored in the `fprc` register.
- **RandomX v2**: if bits 2-5 of the result are all 0, bits 0-1 of the result are stored in the `fprc` register; otherwise `fprc` is left unchanged.

This sets the rounding mode of all subsequent floating point instructions.

#### 5.4.2 CBRANCH

Expand Down
8 changes: 4 additions & 4 deletions src/aes_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
assert(inputSize % 64 == 0);

#ifdef __riscv
if (randomx::cpu.hasAes()) {
if (!softAes) {
hashAes1Rx4_zvkned(input, inputSize, hash);
return;
}
Expand Down Expand Up @@ -153,7 +153,7 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
assert(outputSize % 64 == 0);

#ifdef __riscv
if (randomx::cpu.hasAes()) {
if (!softAes) {
fillAes1Rx4_zvkned(state, outputSize, buffer);
return;
}
Expand Down Expand Up @@ -221,7 +221,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
assert(outputSize % 64 == 0);

#ifdef __riscv
if (randomx::cpu.hasAes()) {
if (!softAes) {
fillAes4Rx4_zvkned(state, outputSize, buffer);
return;
}
Expand Down Expand Up @@ -288,7 +288,7 @@ template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
#ifdef __riscv
if (randomx::cpu.hasAes()) {
if (!softAes) {
hashAndFillAes1Rx4_zvkned(scratchpad, scratchpadSize, hash, fill_state);
return;
}
Expand Down
13 changes: 13 additions & 0 deletions src/aes_hash_rv64_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 12, 8, 4, 0, 76, 72, 68, 64 };

#define lutEnc0 randomx_aes_lut_enc[0]
#define lutEnc1 randomx_aes_lut_enc[1]
#define lutEnc2 randomx_aes_lut_enc[2]
#define lutEnc3 randomx_aes_lut_enc[3]

#define lutDec0 randomx_aes_lut_dec[0]
#define lutDec1 randomx_aes_lut_dec[1]
#define lutDec2 randomx_aes_lut_dec[2]
#define lutDec3 randomx_aes_lut_dec[3]

#define lutEncIndex randomx_aes_lut_enc_index
#define lutDecIndex randomx_aes_lut_dec_index

void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) {
const uint8_t* inptr = (const uint8_t*)input;
const uint8_t* inputEnd = inptr + inputSize;
Expand Down
4 changes: 3 additions & 1 deletion src/asm/configuration.asm
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ RANDOMX_CACHE_ACCESSES EQU 8t
RANDOMX_SUPERSCALAR_LATENCY EQU 170t
RANDOMX_DATASET_BASE_SIZE EQU 2147483648t
RANDOMX_DATASET_EXTRA_SIZE EQU 33554368t
RANDOMX_PROGRAM_SIZE EQU 256t
RANDOMX_PROGRAM_SIZE_V1 EQU 256t
RANDOMX_PROGRAM_SIZE_V2 EQU 384t
RANDOMX_PROGRAM_MAX_SIZE EQU 384t
RANDOMX_PROGRAM_ITERATIONS EQU 2048t
RANDOMX_PROGRAM_COUNT EQU 8t
RANDOMX_SCRATCHPAD_L3 EQU 2097152t
Expand Down
16 changes: 9 additions & 7 deletions src/asm/program_epilogue_linux.inc
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
;# restore callee-saved registers - System V AMD64 ABI
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
mov r15, qword ptr [rsp+280]
mov r14, qword ptr [rsp+272]
mov r13, qword ptr [rsp+264]
mov r12, qword ptr [rsp+256]
mov rbp, qword ptr [rsp+232]
mov rbx, qword ptr [rsp+224]

add rsp, 456

;# program finished
ret 0
ret
2 changes: 1 addition & 1 deletion src/asm/program_epilogue_store.inc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
;# save VM register values
pop rcx
mov rcx, qword ptr [rsp+448]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
Expand Down
40 changes: 20 additions & 20 deletions src/asm/program_epilogue_win64.inc
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
;# restore callee-saved registers - Microsoft x64 calling convention
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
movdqa xmm15, xmmword ptr [rsp+432]
movdqa xmm14, xmmword ptr [rsp+416]
movdqa xmm13, xmmword ptr [rsp+400]
movdqa xmm12, xmmword ptr [rsp+384]
movdqa xmm11, xmmword ptr [rsp+368]
movdqa xmm10, xmmword ptr [rsp+352]
movdqa xmm9, xmmword ptr [rsp+336]
movdqa xmm8, xmmword ptr [rsp+320]
movdqa xmm7, xmmword ptr [rsp+304]
movdqa xmm6, xmmword ptr [rsp+288]
mov r15, qword ptr [rsp+280]
mov r14, qword ptr [rsp+272]
mov r13, qword ptr [rsp+264]
mov r12, qword ptr [rsp+256]
mov rdi, qword ptr [rsp+248]
mov rsi, qword ptr [rsp+240]
mov rbp, qword ptr [rsp+232]
mov rbx, qword ptr [rsp+224]

add rsp, 456

;# program finished
ret
Loading