Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 14 additions & 24 deletions constantine/math/arithmetic/assembly/limbs_asm_mul_mont_arm64.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,18 @@ macro mulMont_CIOS_sparebit_gen[N: static int](

aaSym = ident"aa"
aa = asmArray(aaSym, N, ElemsInReg, asmInputOutput) # used as buffer for final subtraction
mSym = ident"m"
m = asmValue(mSym, Reg, asmOutputEarlyClobber)

uSym = ident"u"
vSym = ident"v"

var # Break dependencies chain
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
# Note: We might want to use an extra register to break dependency chains and expose more ILP
# but then we run into GCC limitations https://github.com/mratsim/constantine/issues/582
var u = asmValue(uSym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `tSym`{.noinit, used.}: typeof(`r_PIR`)
var `aSym`{.noinit.}, `biSym`{.noInit.}, `mSym`{.noinit.}: BaseType
var `uSym`{.noinit.}, `vSym`{.noInit.}: BaseType
var `aSym`{.noinit.}, `biSym`{.noInit.}: BaseType
var `uSym`{.noinit.}: BaseType

let `aaSym` {.noinit, used.} = `a_PIR`

Expand Down Expand Up @@ -111,24 +108,19 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.mul u, lhs, rhs
ctx.adds dst, addend, u
swap(u, v)
template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.mul u, lhs, rhs
ctx.adcs dst, addend, u
swap(u, v)

template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adds dst, addend, u
swap(u, v)
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adcs dst, addend, u
swap(u, v)
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adc dst, addend, u
swap(u, v)

doAssert N >= 2

Expand Down Expand Up @@ -200,11 +192,14 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
# t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
# t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
# t[3] = A + carry + (m*M[3]).hi

# Note: reusing bi here might cost some cycles per iteration compared to perfect usage of ILP,
# but a dedicated register runs into a GCC limitation: https://github.com/mratsim/constantine/issues/582
template m: untyped = bi

ctx.mul m, t[0], m0ninv
ctx.mul u, m, M[0]
ctx.cmn t[0], u # TODO: bad latency chain, hopefully done parallel to prev loop
swap(u, v)

for j in 1 ..< N:
ctx.mulloadd_cio(t[j-1], m, M[j], t[j])
Expand Down Expand Up @@ -298,34 +293,29 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
tN = scratch[2] # High part of extended precision multiplication
A = scratch[3] # Carry during mul step (A)

# Same slot to save registers
bi = scratch[4] # Stores b[i] during mul; aliases m during reduction
m = scratch[5] # Red step: (t[0] * m0ninv) mod 2ʷ
m = scratch[4] # Red step: (t[0] * m0ninv) mod 2ʷ
Comment on lines 298 to +299

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comment for bi on line 298 appears to be outdated. It mentions that scratch[4] stores u during reduction, but u is now assigned to scratch[5]. Since scratch[4] is now reused for m, I suggest updating the comment for bi to reflect its actual usage and avoid potential confusion.

    bi = scratch[4]                                  # Stores b[i] during mul
    m = scratch[4]                                   # Red step: (t[0] * m0ninv) mod 2ʷ


var # break dependency chains
u = scratch[6]
v = scratch[7]
var u = scratch[5]

template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.mul u, lhs, rhs
ctx.adds dst, addend, u
swap(u, v)
template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.mul u, lhs, rhs
ctx.adcs dst, addend, u
swap(u, v)

template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adds dst, addend, u
swap(u, v)
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adcs dst, addend, u
swap(u, v)
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh u, lhs, rhs
ctx.adc dst, addend, u
swap(u, v)

result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
Expand Down Expand Up @@ -392,12 +382,12 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](

# Reduction step
# -------------------------------
# bi and m are aliasing
ctx.comment " Reduction step"

ctx.mul m, t[0], m0ninv
ctx.mul u, m, M[0]
ctx.cmn t[0], u # TODO: bad latency chain, hopefully done parallel to prev loop
swap(u, v)

for j in 1 ..< N:
ctx.mulloadd_cio(t[j-1], m, M[j], t[j])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,25 +65,21 @@ macro redc2xMont_gen[N: static int](
let m0ninv = v[1]
let m = v[2]
var t0 = v[3]
var t1 = v[4]
# var t1 = v[4] # Disabled: we might lose some cycles compared to perfect ILP, but the extra register runs into a GCC limitation, see https://github.com/mratsim/constantine/issues/582

template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.mul t0, lhs, rhs
ctx.adcs dst, addend, t0
swap(t0, t1)

template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh t0, lhs, rhs
ctx.adds dst, addend, t0
swap(t0, t1)
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh t0, lhs, rhs
ctx.adcs dst, addend, t0
swap(t0, t1)
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
ctx.umulh t0, lhs, rhs
ctx.adc dst, addend, t0
swap(t0, t1)

# Algorithm
# ---------------------------------------------------------
Expand All @@ -109,7 +105,6 @@ macro redc2xMont_gen[N: static int](
ctx.comment "---- Reduction " & $i
ctx.mul t0, m, M[0]
ctx.cmn u[0], t0
swap(t0, t1)
ctx.mov u[N], xzr

for j in 0 ..< N:
Expand All @@ -136,7 +131,6 @@ macro redc2xMont_gen[N: static int](
ctx.adc u[i], u[i], t0
else:
ctx.adcs u[i], u[i], t0
swap(t0, t1)

if spareBits >= 2 and lazyReduce:
for i in 0 ..< N:
Expand Down
Loading