aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHaines Sy <hainesy@google.com>2024-05-21 05:56:08 +0000
committerGerrit Code Review <noreply-gerritcodereview@google.com>2024-05-21 05:56:08 +0000
commit049ff35b81a20a8e4e8a1b1cad5b91eef8a9a0c5 (patch)
treeb6a919ce1bb3cbcb45849deb171bc8e5e294678d
parente729547f155487b6138d0816c845e042cd19248c (diff)
parent654f414238a34c178d18a79c13e72e92a1f68db1 (diff)
downloadbinary_translation-main.tar.gz
Merge "Implement vfw[n]m acc/sac as intrinsics" into mainHEADmastermain
-rw-r--r--decoder/include/berberis/decoder/riscv64/decoder.h8
-rw-r--r--interpreter/riscv64/interpreter.h56
-rw-r--r--interpreter/riscv64/interpreter_test.cc246
-rw-r--r--intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h43
4 files changed, 332 insertions, 21 deletions
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h
index 118717c4..63ff277a 100644
--- a/decoder/include/berberis/decoder/riscv64/decoder.h
+++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -277,8 +277,8 @@ class Decoder {
kVfwsubwf = 0b110110,
kVfwmulvf = 0b111000,
kVfwmaccvf = 0b111100,
- kVfwnmaccvf = 0b111100,
- kVfwmsacvf = 0b111100,
+ kVfwnmaccvf = 0b111101,
+ kVfwmsacvf = 0b111110,
kVfwnmsacvf = 0b111111,
};
@@ -319,8 +319,8 @@ class Decoder {
kVfwsubwv = 0b110110,
kVfwmulvv = 0b111000,
kVfwmaccvv = 0b111100,
- kVfwnmaccvv = 0b111100,
- kVfwmsacvv = 0b111100,
+ kVfwnmaccvv = 0b111101,
+ kVfwmsacvv = 0b111110,
kVfwnmsacvv = 0b111111,
};
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index 243151dd..91f51f17 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -1301,6 +1301,34 @@ class Interpreter {
vta,
vma,
kFrm>(args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfwmaccvf:
+ return OpVectorWidenvxw<intrinsics::Vfwmaccvf<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfwnmaccvf:
+ return OpVectorWidenvxw<intrinsics::Vfwnmaccvf<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfwmsacvf:
+ return OpVectorWidenvxw<intrinsics::Vfwmsacvf<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfwnmsacvf:
+ return OpVectorWidenvxw<intrinsics::Vfwnmsacvf<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, arg2);
default:
break;
}
@@ -1562,6 +1590,34 @@ class Interpreter {
vta,
vma,
kFrm>(args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfwmaccvv:
+ return OpVectorWidenvvw<intrinsics::Vfwmaccvv<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfwnmaccvv:
+ return OpVectorWidenvvw<intrinsics::Vfwnmaccvv<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfwmsacvv:
+ return OpVectorWidenvvw<intrinsics::Vfwmsacvv<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfwnmsacvv:
+ return OpVectorWidenvvw<intrinsics::Vfwnmsacvv<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, args.src2);
case Decoder::VOpFVvOpcode::kVFUnary0:
switch (args.vfunary0_opcode) {
case Decoder::VFUnary0Opcode::kVfwcvtxufv:
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index a6a06192..b818b667 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -1170,9 +1170,11 @@ class Riscv64InterpreterTest : public ::testing::Test {
void TestWideningVectorFloatInstruction(uint32_t insn_bytes,
const uint64_t (&expected_result_int64)[8][2],
- const __v2du (&source)[16]) {
- TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kWidening>(
- insn_bytes, source, expected_result_int64);
+ const __v2du (&source)[16],
+ __m128i dst_result = kUndisturbedResult) {
+ TestVectorInstructionInternal<TestVectorInstructionKind::kFloat,
+ TestVectorInstructionMode::kWidening>(
+ insn_bytes, dst_result, source, expected_result_int64);
}
void TestWideningVectorFloatInstruction(uint32_t insn_bytes,
@@ -1205,11 +1207,25 @@ class Riscv64InterpreterTest : public ::testing::Test {
uint32_t insn_bytes,
const __v2du (&source)[16],
const ElementType (&... expected_result)[kResultsCount][kElementCount]) {
- auto Verify = [this, &source](uint32_t insn_bytes,
- uint8_t vsew,
- uint8_t vlmul_max,
- const auto& expected_result,
- auto mask) {
+ TestVectorInstructionInternal<kTestVectorInstructionKind, kTestVectorInstructionMode>(
+ insn_bytes, kUndisturbedResult, source, expected_result...);
+ }
+
+ template <TestVectorInstructionKind kTestVectorInstructionKind,
+ TestVectorInstructionMode kTestVectorInstructionMode,
+ typename... ElementType,
+ size_t... kResultsCount,
+ size_t... kElementCount>
+ void TestVectorInstructionInternal(
+ uint32_t insn_bytes,
+ __m128i dst_result,
+ const __v2du (&source)[16],
+ const ElementType (&... expected_result)[kResultsCount][kElementCount]) {
+ auto Verify = [this, &source, dst_result](uint32_t insn_bytes,
+ uint8_t vsew,
+ uint8_t vlmul_max,
+ const auto& expected_result,
+ auto mask) {
// Mask register is, unconditionally, v0, and we need 8, 16, or 24 to handle full 8-registers
// inputs thus we use v8..v15 for destination and place sources into v16..v23 and v24..v31.
state_.cpu.v[0] = SIMD128Register{kMask}.Get<__uint128_t>();
@@ -1268,7 +1284,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
// Set expected_result vector registers into 0b01010101… pattern.
for (size_t index = 0; index < 8; ++index) {
- state_.cpu.v[8 + index] = SIMD128Register{kUndisturbedResult}.Get<__uint128_t>();
+ state_.cpu.v[8 + index] = SIMD128Register{dst_result}.Get<__uint128_t>();
}
state_.cpu.insn_addr = ToGuestAddr(&insn_bytes);
@@ -1282,14 +1298,14 @@ class Riscv64InterpreterTest : public ::testing::Test {
std::copy_n(source, 8, expected_inactive);
} else {
// For most instructions, follow basic inactive processing rules based on vma flag.
- std::fill_n(expected_inactive, 8, (vma ? kAgnosticResult : kUndisturbedResult));
+ std::fill_n(expected_inactive, 8, (vma ? kAgnosticResult : dst_result));
}
if (emul < 4) {
for (size_t index = 0; index < 1 << emul; ++index) {
if (index == 0 && emul == 2) {
EXPECT_EQ(state_.cpu.v[8 + index],
- ((kUndisturbedResult & kFractionMaskInt8[3]) |
+ ((dst_result & kFractionMaskInt8[3]) |
(SIMD128Register{expected_result[index]} & mask[index] &
~kFractionMaskInt8[3]) |
(expected_inactive[index] & ~mask[index] & ~kFractionMaskInt8[3]))
@@ -1299,12 +1315,12 @@ class Riscv64InterpreterTest : public ::testing::Test {
((SIMD128Register{expected_result[index]} & mask[index] &
kFractionMaskInt8[3]) |
(expected_inactive[index] & ~mask[index] & kFractionMaskInt8[3]) |
- ((vta ? kAgnosticResult : kUndisturbedResult) & ~kFractionMaskInt8[3]))
+ ((vta ? kAgnosticResult : dst_result) & ~kFractionMaskInt8[3]))
.template Get<__uint128_t>());
} else if (index == 3 && emul == 2 && vta) {
EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kAgnosticResult});
} else if (index == 3 && emul == 2) {
- EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kUndisturbedResult});
+ EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{dst_result});
} else {
EXPECT_EQ(state_.cpu.v[8 + index],
((SIMD128Register{expected_result[index]} & mask[index]) |
@@ -1317,7 +1333,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
state_.cpu.v[8],
((SIMD128Register{expected_result[0]} & mask[0] & kFractionMaskInt8[emul - 4]) |
(expected_inactive[0] & ~mask[0] & kFractionMaskInt8[emul - 4]) |
- ((vta ? kAgnosticResult : kUndisturbedResult) & ~kFractionMaskInt8[emul - 4]))
+ ((vta ? kAgnosticResult : dst_result) & ~kFractionMaskInt8[emul - 4]))
.template Get<__uint128_t>());
}
@@ -9758,6 +9774,206 @@ TEST_F(Riscv64InterpreterTest, TestVfnmacc) {
kVectorCalculationsSource);
}
+TEST_F(Riscv64InterpreterTest, TestVfwmacc) {
+ __m128i dst_result = {0x0000'0000'0000'0000, 0x0000'0000'0000'0000};
+ TestWideningVectorFloatInstruction(0xf1881457, // vfwmacc.vv v8, v16, v24, v0.t
+ {{0x3330'e53c'6480'0000, 0x34b2'786b'bbc5'4900},
+ {0x3234'1766'da4a'6200, 0x33b5'cab6'2d6c'4800},
+ {0x3937'92ba'5bd0'8000, 0x3ab9'666a'779a'0d00},
+ {0x383b'4565'd61f'6600, 0x39bd'3935'e5bd'8800},
+ {0x3f3f'423b'5522'0000, 0x40c0'ab36'1ab7'e880},
+ {0x3e41'bab3'e9fa'b500, 0x3fc2'd4dc'5007'e400},
+ {0x4543'f9df'a83a'4000, 0x46c5'2438'7aa3'4a80},
+ {0x4446'53b6'69e6'3700, 0x45c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf100d457, // vfwmacc.vf v8, f1, v16, v0.t
+ {{0xb886'f0ad'0000'0000, 0xb907'a561'b400'0000},
+ {0xb988'5a16'6800'0000, 0xba09'0ecb'1c00'0000},
+ {0xba89'c37f'd000'0000, 0xbb0a'7834'8400'0000},
+ {0xbb8b'2ce9'3800'0000, 0xbc0b'e19d'ec00'0000},
+ {0xbc8c'9652'a000'0000, 0xbd0d'4b07'5400'0000},
+ {0xbd8d'ffbc'0800'0000, 0xbe0e'b470'bc00'0000},
+ {0xbe8f'6925'7000'0000, 0xbf10'0eed'1200'0000},
+ {0xbf90'6947'6c00'0000, 0xc010'c3a1'c600'0000}},
+ kVectorCalculationsSource,
+ dst_result);
+
+ dst_result = {0x401c'6666'6666'6666, 0x401c'6666'6666'6666};
+ TestWideningVectorFloatInstruction(0xf1881457, // vfwmacc.vv v8, v16, v24, v0.t
+ {{0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'66e3'6f53'baee, 0x40c0'aec2'e784'b54d},
+ {0x401c'6666'66f4'3c05, 0x401c'fd0d'48e6'a586},
+ {0x4543'f9df'a83a'4000, 0x46c5'2438'7aa3'4a80},
+ {0x4446'53b6'69e6'3700, 0x45c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf100d457, // vfwmacc.vf v8, f1, v16, v0.t
+ {{0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6657},
+ {0x401c'6666'6666'5766, 0x401c'6666'6657'0c2e},
+ {0x401c'6666'56b1'd3ae, 0x401c'6656'5779'5466},
+ {0x401c'55fd'1efa'6666, 0x4007'4589'40cc'cccc}},
+ kVectorCalculationsSource,
+ dst_result);
+}
+
+TEST_F(Riscv64InterpreterTest, TestVfwnmacc) {
+ __m128i dst_result = {0x0000'0000'0000'0000, 0x0000'0000'0000'0000};
+ TestWideningVectorFloatInstruction(0xf5881457, // vfwnmacc.vv v8, v16, v24, v0.t
+ {{0xb330'e53c'6480'0000, 0xb4b2'786b'bbc5'4900},
+ {0xb234'1766'da4a'6200, 0xb3b5'cab6'2d6c'4800},
+ {0xb937'92ba'5bd0'8000, 0xbab9'666a'779a'0d00},
+ {0xb83b'4565'd61f'6600, 0xb9bd'3935'e5bd'8800},
+ {0xbf3f'423b'5522'0000, 0xc0c0'ab36'1ab7'e880},
+ {0xbe41'bab3'e9fa'b500, 0xbfc2'd4dc'5007'e400},
+ {0xc543'f9df'a83a'4000, 0xc6c5'2438'7aa3'4a80},
+ {0xc446'53b6'69e6'3700, 0xc5c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf500d457, // vfwnmacc.vf v8, f1, v16, v0.t
+ {{0x3886'f0ad'0000'0000, 0x3907'a561'b400'0000},
+ {0x3988'5a16'6800'0000, 0x3a09'0ecb'1c00'0000},
+ {0x3a89'c37f'd000'0000, 0x3b0a'7834'8400'0000},
+ {0x3b8b'2ce9'3800'0000, 0x3c0b'e19d'ec00'0000},
+ {0x3c8c'9652'a000'0000, 0x3d0d'4b07'5400'0000},
+ {0x3d8d'ffbc'0800'0000, 0x3e0e'b470'bc00'0000},
+ {0x3e8f'6925'7000'0000, 0x3f10'0eed'1200'0000},
+ {0x3f90'6947'6c00'0000, 0x4010'c3a1'c600'0000}},
+ kVectorCalculationsSource,
+ dst_result);
+
+ dst_result = {0x401c'6666'6666'6666, 0x401c'6666'6666'6666};
+ TestWideningVectorFloatInstruction(0xf5881457, // vfwnmacc.vv v8, v16, v24, v0.t
+ {{0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'66e3'6f53'baee, 0xc0c0'aec2'e784'b54d},
+ {0xc01c'6666'66f4'3c05, 0xc01c'fd0d'48e6'a586},
+ {0xc543'f9df'a83a'4000, 0xc6c5'2438'7aa3'4a80},
+ {0xc446'53b6'69e6'3700, 0xc5c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf500d457, // vfwnmacc.vf v8, f1, v16, v0.t
+ {{0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6657},
+ {0xc01c'6666'6666'5766, 0xc01c'6666'6657'0c2e},
+ {0xc01c'6666'56b1'd3ae, 0xc01c'6656'5779'5466},
+ {0xc01c'55fd'1efa'6666, 0xc007'4589'40cc'cccc}},
+ kVectorCalculationsSource,
+ dst_result);
+}
+
+TEST_F(Riscv64InterpreterTest, TestVfwmsac) {
+ __m128i dst_result = {0x0000'0000'0000'0000, 0x0000'0000'0000'0000};
+ TestWideningVectorFloatInstruction(0xf9881457, // vfwmsac.vv v8, v16, v24, v0.t
+ {{0x3330'e53c'6480'0000, 0x34b2'786b'bbc5'4900},
+ {0x3234'1766'da4a'6200, 0x33b5'cab6'2d6c'4800},
+ {0x3937'92ba'5bd0'8000, 0x3ab9'666a'779a'0d00},
+ {0x383b'4565'd61f'6600, 0x39bd'3935'e5bd'8800},
+ {0x3f3f'423b'5522'0000, 0x40c0'ab36'1ab7'e880},
+ {0x3e41'bab3'e9fa'b500, 0x3fc2'd4dc'5007'e400},
+ {0x4543'f9df'a83a'4000, 0x46c5'2438'7aa3'4a80},
+ {0x4446'53b6'69e6'3700, 0x45c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf900d457, // vfwmsac.vf v8, f1, v16, v0.t
+ {{0xb886'f0ad'0000'0000, 0xb907'a561'b400'0000},
+ {0xb988'5a16'6800'0000, 0xba09'0ecb'1c00'0000},
+ {0xba89'c37f'd000'0000, 0xbb0a'7834'8400'0000},
+ {0xbb8b'2ce9'3800'0000, 0xbc0b'e19d'ec00'0000},
+ {0xbc8c'9652'a000'0000, 0xbd0d'4b07'5400'0000},
+ {0xbd8d'ffbc'0800'0000, 0xbe0e'b470'bc00'0000},
+ {0xbe8f'6925'7000'0000, 0xbf10'0eed'1200'0000},
+ {0xbf90'6947'6c00'0000, 0xc010'c3a1'c600'0000}},
+ kVectorCalculationsSource,
+ dst_result);
+
+ dst_result = {0x401c'6666'6666'6666, 0x401c'6666'6666'6666};
+ TestWideningVectorFloatInstruction(0xf9881457, // vfwmsac.vv v8, v16, v24, v0.t
+ {{0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'65e9'5d79'11de, 0x40c0'a7a9'4deb'1bb3},
+ {0xc01c'6666'65d8'90c7, 0xc01b'cfbf'83e6'2746},
+ {0x4543'f9df'a83a'4000, 0x46c5'2438'7aa3'4a80},
+ {0x4446'53b6'69e6'3700, 0x45c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xf900d457, // vfwmsac.vf v8, f1, v16, v0.t
+ {{0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6666},
+ {0xc01c'6666'6666'6666, 0xc01c'6666'6666'6675},
+ {0xc01c'6666'6666'7566, 0xc01c'6666'6675'c09e},
+ {0xc01c'6666'761a'f91e, 0xc01c'6676'7553'7866},
+ {0xc01c'76cf'add2'6666, 0xc026'9504'1633'3333}},
+ kVectorCalculationsSource,
+ dst_result);
+}
+
+TEST_F(Riscv64InterpreterTest, TestVfwnmsac) {
+ __m128i dst_result = {0x0000'0000'0000'0000, 0x0000'0000'0000'0000};
+ TestWideningVectorFloatInstruction(0xfd881457, // vfwnmsac.vv v8, v16, v24, v0.t
+ {{0xb330'e53c'6480'0000, 0xb4b2'786b'bbc5'4900},
+ {0xb234'1766'da4a'6200, 0xb3b5'cab6'2d6c'4800},
+ {0xb937'92ba'5bd0'8000, 0xbab9'666a'779a'0d00},
+ {0xb83b'4565'd61f'6600, 0xb9bd'3935'e5bd'8800},
+ {0xbf3f'423b'5522'0000, 0xc0c0'ab36'1ab7'e880},
+ {0xbe41'bab3'e9fa'b500, 0xbfc2'd4dc'5007'e400},
+ {0xc543'f9df'a83a'4000, 0xc6c5'2438'7aa3'4a80},
+ {0xc446'53b6'69e6'3700, 0xc5c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xfd00d457, // vfwnmsac.vf v8, f1, v16, v0.t
+ {{0x3886'f0ad'0000'0000, 0x3907'a561'b400'0000},
+ {0x3988'5a16'6800'0000, 0x3a09'0ecb'1c00'0000},
+ {0x3a89'c37f'd000'0000, 0x3b0a'7834'8400'0000},
+ {0x3b8b'2ce9'3800'0000, 0x3c0b'e19d'ec00'0000},
+ {0x3c8c'9652'a000'0000, 0x3d0d'4b07'5400'0000},
+ {0x3d8d'ffbc'0800'0000, 0x3e0e'b470'bc00'0000},
+ {0x3e8f'6925'7000'0000, 0x3f10'0eed'1200'0000},
+ {0x3f90'6947'6c00'0000, 0x4010'c3a1'c600'0000}},
+ kVectorCalculationsSource,
+ dst_result);
+
+ dst_result = {0x401c'6666'6666'6666, 0x401c'6666'6666'6666};
+ TestWideningVectorFloatInstruction(0xfd881457, // vfwnmsac.vv v8, v16, v24, v0.t
+ {{0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'65e9'5d79'11de, 0xc0c0'a7a9'4deb'1bb3},
+ {0x401c'6666'65d8'90c7, 0x401b'cfbf'83e6'2746},
+ {0xc543'f9df'a83a'4000, 0xc6c5'2438'7aa3'4a80},
+ {0xc446'53b6'69e6'3700, 0xc5c7'8e1f'2e31'8400}},
+ kVectorCalculationsSource,
+ dst_result);
+ TestWideningVectorFloatInstruction(0xfd00d457, // vfwnmsac.vf v8, f1, v16, v0.t
+ {{0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6666},
+ {0x401c'6666'6666'6666, 0x401c'6666'6666'6675},
+ {0x401c'6666'6666'7566, 0x401c'6666'6675'c09e},
+ {0x401c'6666'761a'f91e, 0x401c'6676'7553'7866},
+ {0x401c'76cf'add2'6666, 0x4026'9504'1633'3333}},
+ kVectorCalculationsSource,
+ dst_result);
+}
+
TEST_F(Riscv64InterpreterTest, TestVfmsac) {
TestVectorFloatInstruction(0xb9881457, // vfmsac.vv v8, v16, v24, v0.t
{{0xd555'5555, 0xd555'5555, 0xd555'5555, 0xd555'5555},
@@ -9815,7 +10031,6 @@ TEST_F(Riscv64InterpreterTest, TestVfnmsac) {
{0x5614'25da'f5a1'f73b, 0x5555'5555'5555'5555},
{0x7ff0'0000'0000'0000, 0x7e5b'5815'60f1'ac51},
{0x7ff0'0000'0000'0000, 0x7ff0'0000'0000'0000}},
-
kVectorCalculationsSource);
TestVectorFloatInstruction(0xbd00d457, // vfnmsac.vf v8, f1, v16, v0.t
{{0x5555'5555, 0x5555'5555, 0x5555'5555, 0x5555'5555},
@@ -9977,7 +10192,6 @@ TEST_F(Riscv64InterpreterTest, TestVfnmsub) {
{0x6cbe'71c6'6f19'1715, 0x74c4'9393'3ce7'3b90},
{0x7cce'8731'2f2e'81d5, 0x7ff0'0000'0000'0000},
{0x7ff0'0000'0000'0000, 0x7ff0'0000'0000'0000}},
-
kVectorCalculationsSource);
TestVectorFloatInstruction(0xad00d457, // vfnmsub.vf v8, f1, v16, v0.t
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
index 0094b17e..02c8da5d 100644
--- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
+++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -885,6 +885,22 @@ std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType
Vw##name##vx, Widenvvw, return ({ __VA_ARGS__; }); \
, (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))
+#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...) \
+ DEFINE_W_ARITHMETIC_INTRINSIC( \
+ Vfw##name##vv, Widenvvw, return ({ __VA_ARGS__; }); \
+ , \
+ (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
+ (csr), \
+ (src1, src2, src3))
+
+#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
+ DEFINE_W_ARITHMETIC_INTRINSIC( \
+ Vfw##name##vf, Widenvvw, return ({ __VA_ARGS__; }); \
+ , \
+ (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
+ (csr), \
+ (src1, src2, src3))
+
#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
, (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))
@@ -1189,7 +1205,30 @@ DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccsu, auto [arg1, arg2, arg3] = std:
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccus, auto [arg1, arg2, arg3] = std::tuple{args...};
(std::get<0>(WideMultiplySignedUnsigned(arg1, arg2))) +
arg3)
-
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
+ macc, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
+ macc, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
+ nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
+ nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
+ msac, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
+ msac, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
+ nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
+DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
+ nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
+ std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{args...};
(arg1 >> arg2))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
@@ -1232,6 +1271,8 @@ DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW
+#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW
+#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW
} // namespace berberis::intrinsics