siyul-park · siyul-park · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/asm/arm64/encoder.go b/asm/arm64/encoder.go
@@ -189,6 +189,8 @@ var floatBinaryOpcodes = map[Op]struct{ single, double uint32 }{
 	OpFSUB: {0x1E203800, 0x1E603800},
 	OpFMUL: {0x1E200800, 0x1E600800},
 	OpFDIV: {0x1E201800, 0x1E601800},
+	OpFMIN: {0x1E205800, 0x1E605800},
+	OpFMAX: {0x1E204800, 0x1E604800},
 }
 
 // floatTernaryOpcodes maps each 4-register scalar float opcode (FMADD-family)
@@ -585,10 +587,25 @@ func (e *Encoder) Encode(inst asm.Instruction) ([]byte, error) {
 		// Float — arithmetic (double precision)
 		// -----------------------------------------------------------------------
 
-	case OpFADD, OpFSUB, OpFMUL, OpFDIV:
+	case OpFADD, OpFSUB, OpFMUL, OpFDIV, OpFMIN, OpFMAX:
 		fb := floatBinaryOpcodes[op]
 		return e.encodeFloatBinary(fb.single, fb.double, inst)
 
+	// -----------------------------------------------------------------------
+	// SIMD (fixed 8B arrangement): CNT, ADDV
+	// -----------------------------------------------------------------------
+
+	case OpCNT, OpADDV:
+		d, n, err := e.decodeReg2(inst)
+		if err != nil {
+			return nil, err
+		}
+		base := uint32(0x0E205800) // CNT Vd.8B, Vn.8B
+		if op == OpADDV {
+			base = 0x0E31B800 // ADDV Bd, Vn.8B
+		}
+		return enc(base | reg(n)<<5 | reg(d)), nil
+
 	case OpFMADD, OpFMSUB, OpFNMADD, OpFNMSUB:
 		ft := floatTernaryOpcodes[op]
 		return e.encodeFloatTernary(ft.single, ft.double, inst)

diff --git a/asm/arm64/encoder_test.go b/asm/arm64/encoder_test.go
@@ -82,6 +82,12 @@ func TestEncoder_Encode(t *testing.T) {
 			{"FSUB D1,D2,D3", FSUB(D1, D2, D3), 0x1E633841},
 			{"FMUL D1,D2,D3", FMUL(D1, D2, D3), 0x1E630841},
 			{"FDIV D1,D2,D3", FDIV(D1, D2, D3), 0x1E631841},
+			{"FMIN D1,D2,D3", FMIN(D1, D2, D3), 0x1E635841},
+			{"FMIN S1,S2,S3", FMIN(S1, S2, S3), 0x1E235841},
+			{"FMAX D1,D2,D3", FMAX(D1, D2, D3), 0x1E634841},
+			{"FMAX S1,S2,S3", FMAX(S1, S2, S3), 0x1E234841},
+			{"CNT D1,D2", CNT(D1, D2), 0x0E205841},
+			{"ADDV D1,D2", ADDV(D1, D2), 0x0E31B841},
 			{"FMADD D0,D1,D2,D3", FMADD(D0, D1, D2, D3), 0x1F420C20},
 			{"FMSUB D0,D1,D2,D3", FMSUB(D0, D1, D2, D3), 0x1F428C20},
 			{"FNMADD D0,D1,D2,D3", FNMADD(D0, D1, D2, D3), 0x1F620C20},

diff --git a/asm/arm64/instr.go b/asm/arm64/instr.go
@@ -117,6 +117,8 @@ const (
 	OpFSUB
 	OpFMUL
 	OpFDIV
+	OpFMIN
+	OpFMAX
 	OpFMADD
 	OpFMSUB
 	OpFNMADD
@@ -131,6 +133,10 @@ const (
 	OpFRINTP
 	OpFRINTZ
 
+	// SIMD (fixed 8B arrangement)
+	OpCNT
+	OpADDV
+
 	// Float move / compare
 	OpFMOV
 	OpFCMP
@@ -489,6 +495,8 @@ func FADD(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFADD, dst,
 func FSUB(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFSUB, dst, src1, src2) }
 func FMUL(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFMUL, dst, src1, src2) }
 func FDIV(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFDIV, dst, src1, src2) }
+func FMIN(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFMIN, dst, src1, src2) }
+func FMAX(dst, src1, src2 asm.Reg) asm.Instruction { return newReg3(OpFMAX, dst, src1, src2) }
 
 // FMADD Dd, Dn, Dm, Da  →  Dd = Da + Dn*Dm
 func FMADD(dst, src1, src2, acc asm.Reg) asm.Instruction {
@@ -519,6 +527,12 @@ func FRINTM(dst, src asm.Reg) asm.Instruction { return newReg2(OpFRINTM, dst, sr
 func FRINTP(dst, src asm.Reg) asm.Instruction { return newReg2(OpFRINTP, dst, src) }
 func FRINTZ(dst, src asm.Reg) asm.Instruction { return newReg2(OpFRINTZ, dst, src) }
 
+// CNT Vd.8B, Vn.8B  →  per-byte population count.
+// ADDV Bd, Vn.8B    →  sum the 8 byte lanes into the low byte of Vd.
+// Both take SIMD V registers (fixed 8-byte arrangement).
+func CNT(dst, src asm.Reg) asm.Instruction  { return newReg2(OpCNT, dst, src) }
+func ADDV(dst, src asm.Reg) asm.Instruction { return newReg2(OpADDV, dst, src) }
+
 // ---------------------------------------------------------------------------
 // Float-point move / compare
 // ---------------------------------------------------------------------------

diff --git a/docs/instruction-set.md b/docs/instruction-set.md
@@ -74,6 +74,7 @@ Offsets are signed 16-bit values encoded little-endian. `BR 5` skips 5 bytes pas
 | `BR_TABLE` | `{-2, 2}` | `index →` | ◐ | Jump table; negative or out-of-range index uses default target. JIT only for simple stack shapes. |
 | `CALL` | `{}` | `fn →` | ◐ | Call `*Function`, `*HostFunction`, or `*Closure`; trace JIT lowers observed direct calls, small same-arity function-value indirect dispatches, and eligible closure-body calls to native `BL`. Host calls and misses fall back. |
 | `RETURN` | `{}` | `→` | ◐ | Return from current frame; trace JIT lowers entry returns and stitches inlined callee returns. |
+| `RETURN_CALL` | `{}` | `args… fn →` | ◐ | Tail call: pops args + funcref like `CALL`, but reuses the current frame so tail recursion runs in constant frame depth. Above the entry frame the frame is replaced in place; at the entry frame a new frame is pushed (callee returns to the entry frame normally). Target is normally a `*Function` or `*Closure`; a `*HostFunction` target is instead invoked in place and its results returned. Result arity should match the current function's. Trace JIT lowers plain-function targets: a tail call back to the trace anchor becomes a native loop back-edge (self/mutual recursion in constant depth), a tail call to another function morphs the frame in place. Host and closure targets fall back. |
 
 ## Variables
 
@@ -129,6 +130,13 @@ A `ref`-typed slot is the VM's dynamic ("any") type: it holds any `Boxed` — an
 | `I32_AND` | `{}` | `a b → i32` | ✅ | Bitwise AND. |
 | `I32_OR` | `{}` | `a b → i32` | ✅ | Bitwise OR. |
 | `I32_XOR` | `{}` | `a b → i32` | ✅ | Bitwise XOR. |
+| `I32_CLZ` | `{}` | `x → i32` | ✅ | Count leading zero bits (`32` if `x == 0`). |
+| `I32_CTZ` | `{}` | `x → i32` | ✅ | Count trailing zero bits (`32` if `x == 0`). |
+| `I32_POPCNT` | `{}` | `x → i32` | ✅ | Count set bits. |
+| `I32_ROTL` | `{}` | `a b → i32` | ✅ | Rotate `a` left by `b` (modulo 32). |
+| `I32_ROTR` | `{}` | `a b → i32` | ✅ | Rotate `a` right by `b` (modulo 32). |
+| `I32_EXTEND8_S` | `{}` | `x → i32` | ✅ | Sign-extend low 8 bits to i32. |
+| `I32_EXTEND16_S` | `{}` | `x → i32` | ✅ | Sign-extend low 16 bits to i32. |
 | `I32_EQZ` | `{}` | `x → i32` | ✅ | Push `I32(1)` if zero. |
 | `I32_EQ` | `{}` | `a b → i32` | ✅ | Equality comparison. |
 | `I32_NE` | `{}` | `a b → i32` | ✅ | Inequality comparison. |
@@ -146,6 +154,7 @@ A `ref`-typed slot is the VM's dynamic ("any") type: it holds any `Boxed` — an
 | `I32_TO_F32_U` | `{}` | `i32 → f32` | ✅ | Convert unsigned i32 to f32. |
 | `I32_TO_F64_S` | `{}` | `i32 → f64` | ✅ | Convert signed i32 to f64. |
 | `I32_TO_F64_U` | `{}` | `i32 → f64` | ✅ | Convert unsigned i32 to f64. |
+| `I32_REINTERPRET_F32` | `{}` | `f32 → i32` | ✅ | Reinterpret f32 bit pattern as i32 (no conversion). |
 
 ## i64 Operations
 
@@ -162,13 +171,25 @@ A `ref`-typed slot is the VM's dynamic ("any") type: it holds any `Boxed` — an
 | `I64_SHL` | `{}` | `a b → i64` | ✅ | Left shift; amount uses low 6 bits. |
 | `I64_SHR_S` | `{}` | `a b → i64` | ✅ | Arithmetic right shift. |
 | `I64_SHR_U` | `{}` | `a b → i64` | ✅ | Logical right shift. |
+| `I64_XOR` | `{}` | `a b → i64` | ✅ | Bitwise XOR. |
+| `I64_AND` | `{}` | `a b → i64` | ✅ | Bitwise AND. |
+| `I64_OR` | `{}` | `a b → i64` | ✅ | Bitwise OR. |
+| `I64_CLZ` | `{}` | `x → i64` | ✅ | Count leading zero bits (`64` if `x == 0`). |
+| `I64_CTZ` | `{}` | `x → i64` | ✅ | Count trailing zero bits (`64` if `x == 0`). |
+| `I64_POPCNT` | `{}` | `x → i64` | ✅ | Count set bits. |
+| `I64_ROTL` | `{}` | `a b → i64` | ✅ | Rotate `a` left by `b` (modulo 64). |
+| `I64_ROTR` | `{}` | `a b → i64` | ✅ | Rotate `a` right by `b` (modulo 64). |
+| `I64_EXTEND8_S` | `{}` | `x → i64` | ✅ | Sign-extend low 8 bits to i64. |
+| `I64_EXTEND16_S` | `{}` | `x → i64` | ✅ | Sign-extend low 16 bits to i64. |
+| `I64_EXTEND32_S` | `{}` | `x → i64` | ✅ | Sign-extend low 32 bits to i64. |
 | `I64_EQZ` | `{}` | `x → i32` | ✅ | Push `I32(1)` if zero. |
 | `I64_EQ` … `I64_GE_U` | `{}` | `a b → i32` | ✅ | Same semantics as i32 comparisons. |
 | `I64_TO_I32` | `{}` | `i64 → i32` | ✅ | Truncate to low 32 bits. |
 | `I64_TO_F32_S` | `{}` | `i64 → f32` | ✅ | Convert signed i64 to f32. |
 | `I64_TO_F32_U` | `{}` | `i64 → f32` | ✅ | Convert unsigned i64 to f32. |
 | `I64_TO_F64_S` | `{}` | `i64 → f64` | ✅ | Convert signed i64 to f64. |
 | `I64_TO_F64_U` | `{}` | `i64 → f64` | ✅ | Convert unsigned i64 to f64. |
+| `I64_REINTERPRET_F64` | `{}` | `f64 → i64` | ✅ | Reinterpret f64 bit pattern as i64 (no conversion). |
 
 ## f32 Operations
 
@@ -179,12 +200,23 @@ A `ref`-typed slot is the VM's dynamic ("any") type: it holds any `Boxed` — an
 | `F32_SUB` | `{}` | `a b → f32` | ✅ | Floating-point subtraction. |
 | `F32_MUL` | `{}` | `a b → f32` | ✅ | Floating-point multiplication. |
 | `F32_DIV` | `{}` | `a b → f32` | ✅ | Floating-point division. |
+| `F32_ABS` | `{}` | `x → f32` | ✅ | Absolute value (clears sign bit). |
+| `F32_NEG` | `{}` | `x → f32` | ✅ | Negate (flips sign bit, incl. NaN). |
+| `F32_SQRT` | `{}` | `x → f32` | ✅ | Square root. |
+| `F32_CEIL` | `{}` | `x → f32` | ✅ | Round toward +∞. |
+| `F32_FLOOR` | `{}` | `x → f32` | ✅ | Round toward −∞. |
+| `F32_TRUNC` | `{}` | `x → f32` | ✅ | Round toward zero. |
+| `F32_NEAREST` | `{}` | `x → f32` | ✅ | Round to nearest, ties to even. |
+| `F32_MIN` | `{}` | `a b → f32` | ✅ | Minimum; NaN propagates, `min(-0,+0)=-0`. |
+| `F32_MAX` | `{}` | `a b → f32` | ✅ | Maximum; NaN propagates, `max(-0,+0)=+0`. |
+| `F32_COPYSIGN` | `{}` | `a b → f32` | ✅ | Magnitude of `a` with sign of `b`. |
 | `F32_EQ` … `F32_GE` | `{}` | `a b → i32` | ✅ | Floating-point comparisons. |
-| `F32_TO_I32_S` | `{}` | `f32 → i32` | ✅ | Truncate to signed i32. |
-| `F32_TO_I32_U` | `{}` | `f32 → i32` | ✅ | Truncate to unsigned i32. |
-| `F32_TO_I64_S` | `{}` | `f32 → i64` | ✅ | Truncate to signed i64. |
-| `F32_TO_I64_U` | `{}` | `f32 → i64` | ✅ | Truncate to unsigned i64. |
+| `F32_TO_I32_S` | `{}` | `f32 → i32` | ✅ | Truncate toward zero to signed i32, saturating (NaN→0, out-of-range→nearest bound). |
+| `F32_TO_I32_U` | `{}` | `f32 → i32` | ✅ | Truncate toward zero to unsigned i32, saturating (NaN/negative→0, overflow→`u32` max). |
+| `F32_TO_I64_S` | `{}` | `f32 → i64` | ✅ | Truncate toward zero to signed i64, saturating (NaN→0, out-of-range→nearest bound). |
+| `F32_TO_I64_U` | `{}` | `f32 → i64` | ✅ | Truncate toward zero to unsigned i64, saturating (NaN/negative→0, overflow→`u64` max). |
 | `F32_TO_F64` | `{}` | `f32 → f64` | ✅ | Widen f32 to f64. |
+| `F32_REINTERPRET_I32` | `{}` | `i32 → f32` | ✅ | Reinterpret i32 bit pattern as f32 (no conversion). |
 
 ## f64 Operations
 
@@ -195,12 +227,23 @@ A `ref`-typed slot is the VM's dynamic ("any") type: it holds any `Boxed` — an
 | `F64_SUB` | `{}` | `a b → f64` | ✅ | Floating-point subtraction. |
 | `F64_MUL` | `{}` | `a b → f64` | ✅ | Floating-point multiplication. |
 | `F64_DIV` | `{}` | `a b → f64` | ✅ | Floating-point division. |
+| `F64_ABS` | `{}` | `x → f64` | ✅ | Absolute value (clears sign bit). |
+| `F64_NEG` | `{}` | `x → f64` | ✅ | Negate (flips sign bit, incl. NaN). |
+| `F64_SQRT` | `{}` | `x → f64` | ✅ | Square root. |
+| `F64_CEIL` | `{}` | `x → f64` | ✅ | Round toward +∞. |
+| `F64_FLOOR` | `{}` | `x → f64` | ✅ | Round toward −∞. |
+| `F64_TRUNC` | `{}` | `x → f64` | ✅ | Round toward zero. |
+| `F64_NEAREST` | `{}` | `x → f64` | ✅ | Round to nearest, ties to even. |
+| `F64_MIN` | `{}` | `a b → f64` | ✅ | Minimum; NaN propagates, `min(-0,+0)=-0`. |
+| `F64_MAX` | `{}` | `a b → f64` | ✅ | Maximum; NaN propagates, `max(-0,+0)=+0`. |
+| `F64_COPYSIGN` | `{}` | `a b → f64` | ✅ | Magnitude of `a` with sign of `b`. |
 | `F64_EQ` … `F64_GE` | `{}` | `a b → i32` | ✅ | Floating-point comparisons. |
-| `F64_TO_I32_S` | `{}` | `f64 → i32` | ✅ | Truncate to signed i32. |
-| `F64_TO_I32_U` | `{}` | `f64 → i32` | ✅ | Truncate to unsigned i32. |
-| `F64_TO_I64_S` | `{}` | `f64 → i64` | ✅ | Truncate to signed i64. |
-| `F64_TO_I64_U` | `{}` | `f64 → i64` | ✅ | Truncate to unsigned i64. |
+| `F64_TO_I32_S` | `{}` | `f64 → i32` | ✅ | Truncate toward zero to signed i32, saturating (NaN→0, out-of-range→nearest bound). |
+| `F64_TO_I32_U` | `{}` | `f64 → i32` | ✅ | Truncate toward zero to unsigned i32, saturating (NaN/negative→0, overflow→`u32` max). |
+| `F64_TO_I64_S` | `{}` | `f64 → i64` | ✅ | Truncate toward zero to signed i64, saturating (NaN→0, out-of-range→nearest bound). |
+| `F64_TO_I64_U` | `{}` | `f64 → i64` | ✅ | Truncate toward zero to unsigned i64, saturating (NaN/negative→0, overflow→`u64` max). |
 | `F64_TO_F32` | `{}` | `f64 → f32` | ✅ | Narrow f64 to f32. |
+| `F64_REINTERPRET_I64` | `{}` | `i64 → f64` | ⬜ | Reinterpret i64 bit pattern as f64 (no conversion). |
 
 ## String Operations
 
@@ -249,3 +292,4 @@ Map keys use primitive value identity for `i32`, `i64`, `f32`, and `f64`; all re
 | `MAP_SET` | `{}` | `map key value →` | ◐ | Insert or replace entry. JIT keeps framed entries by exiting locally to the threaded handler. |
 | `MAP_DELETE` | `{}` | `map key →` | ◐ | Delete entry; missing key is a no-op. JIT keeps framed entries by exiting locally to the threaded handler. |
 | `MAP_CLEAR` | `{}` | `map →` | ◐ | Delete all entries. JIT keeps framed entries by exiting locally to the threaded handler. |
+| `MAP_KEYS` | `{}` | `map → array` | ⬜ | Snapshot keys into a new `[]K` array (`K` = map key type), in unspecified order. Enables guest map iteration with `ARRAY_LEN`/`ARRAY_GET` + `MAP_GET`. |
diff --git a/instr/opcode.go b/instr/opcode.go
@@ -30,6 +30,7 @@ const (
 
 	CALL
 	RETURN
+	RETURN_CALL
 
 	GLOBAL_GET
 	GLOBAL_SET
@@ -74,6 +75,15 @@ const (
 	I32_AND
 	I32_OR
 
+	I32_CLZ
+	I32_CTZ
+	I32_POPCNT
+	I32_ROTL
+	I32_ROTR
+
+	I32_EXTEND8_S
+	I32_EXTEND16_S
+
 	I32_EQZ
 	I32_EQ
 	I32_NE
@@ -93,6 +103,8 @@ const (
 	I32_TO_F64_U
 	I32_TO_F64_S
 
+	I32_REINTERPRET_F32
+
 	I64_CONST
 
 	I64_ADD
@@ -106,6 +118,20 @@ const (
 	I64_SHR_S
 	I64_SHR_U
 
+	I64_XOR
+	I64_AND
+	I64_OR
+
+	I64_CLZ
+	I64_CTZ
+	I64_POPCNT
+	I64_ROTL
+	I64_ROTR
+
+	I64_EXTEND8_S
+	I64_EXTEND16_S
+	I64_EXTEND32_S
+
 	I64_EQZ
 	I64_EQ
 	I64_NE
@@ -124,13 +150,26 @@ const (
 	I64_TO_F64_S
 	I64_TO_F64_U
 
+	I64_REINTERPRET_F64
+
 	F32_CONST
 
 	F32_ADD
 	F32_SUB
 	F32_MUL
 	F32_DIV
 
+	F32_ABS
+	F32_NEG
+	F32_SQRT
+	F32_CEIL
+	F32_FLOOR
+	F32_TRUNC
+	F32_NEAREST
+	F32_MIN
+	F32_MAX
+	F32_COPYSIGN
+
 	F32_EQ
 	F32_NE
 	F32_LT
@@ -144,13 +183,26 @@ const (
 	F32_TO_I64_U
 	F32_TO_F64
 
+	F32_REINTERPRET_I32
+
 	F64_CONST
 
 	F64_ADD
 	F64_SUB
 	F64_MUL
 	F64_DIV
 
+	F64_ABS
+	F64_NEG
+	F64_SQRT
+	F64_CEIL
+	F64_FLOOR
+	F64_TRUNC
+	F64_NEAREST
+	F64_MIN
+	F64_MAX
+	F64_COPYSIGN
+
 	F64_EQ
 	F64_NE
 	F64_LT
@@ -164,6 +216,8 @@ const (
 	F64_TO_I64_U
 	F64_TO_F32
 
+	F64_REINTERPRET_I64
+
 	STRING_NEW_UTF32
 
 	STRING_LEN
@@ -202,6 +256,7 @@ const (
 	MAP_SET
 	MAP_DELETE
 	MAP_CLEAR
+	MAP_KEYS
 
 	CLOSURE_NEW
 )