Documentation

Interval.Floating.Add

Floating point addition and subtraction #

To add, we shift the smaller number to near the bigger number, perform a 128-bit addition, and normalize the result.

Normalization starting with 128 bits #

@[irreducible]

def Floating.add_adjust (l s : UInt64) :

UInt64 × UInt64

Adjust an exponent for addition. The result is (t, d+1), where d - 65 is the amount to shift left by, and t is the final exponent.

Equations

Floating.add_adjust l s = bif l == 127 then (s + 1, 0) else match Floating.lower (126 - l) s with | (s, d) => (s, d + 1)

Instances For

def Floating.add_n (r : UInt128) (s : UInt64) (up : Bool) :

Abbreviation for our expanded n value

Equations

One or more equations did not get rendered due to their size.

Instances For

theorem Floating.add_adjust_2_eq {r : UInt128} {s : UInt64} (g : r.log2 ≠ 127 ∨ s ≠ UInt64.max) :

↑(add_adjust r.log2 s).2.toNat = ↑s.toNat + 1 - ↑(add_adjust r.log2 s).1.toNat

The difference result expressed as a difference

theorem Floating.adjust_d_le (r : UInt128) (s : UInt64) :

(add_adjust r.log2 s).2.toNat ≤ 127 - r.toNat.log2

add_adjust produces small d + 1

theorem Floating.adjust_r_lt_128 (r : UInt128) (s : UInt64) :

r.toNat < 2 ^ (128 - (add_adjust r.log2 s).2.toNat)

add_adjust doesn't overflow r

theorem Floating.adjust_le_r {r : UInt128} {s : UInt64} (r0 : r.toNat ≠ 0) (s0 : (add_adjust r.log2 s).1.toNat ≠ 0) :

2 ^ (127 - (add_adjust r.log2 s).2.toNat) ≤ r.toNat

add_adjust normalizes r

theorem Floating.adjust_mul_lt_128 (r : UInt128) (s : UInt64) :

r.toNat * 2 ^ (add_adjust r.log2 s).2.toNat < 2 ^ 128

add_adjust doesn't overflow r

theorem Floating.adjust_le_mul {r : UInt128} {s : UInt64} (r0 : r.toNat ≠ 0) (s0 : (add_adjust r.log2 s).1.toNat ≠ 0) :

2 ^ 127 ≤ r.toNat * 2 ^ (add_adjust r.log2 s).2.toNat

add_adjust normalizes r

theorem Floating.adjust_lo_eq {r : UInt128} {s : UInt64} (a65 : 65 ≤ (add_adjust r.log2 s).2.toNat) :

r.lo.toNat = r.toNat

theorem Floating.adjust_shift_le_63 (r : UInt128) (s : UInt64) (up : Bool) (a65 : (add_adjust r.log2 s).2.toNat < 65) :

(r.shiftRightRound (65 - (add_adjust r.log2 s).2) up).toNat ≤ 2 ^ 63

add_adjust doesn't make r too big

theorem Floating.adjust_lo_shift_le_63 (r : UInt128) (s : UInt64) (up : Bool) (a65 : (add_adjust r.log2 s).2.toNat < 65) :

(r.shiftRightRound (65 - (add_adjust r.log2 s).2) up).lo.toNat ≤ 2 ^ 63

add_adjust doesn't make r too big

theorem Floating.add_n_le (r : UInt128) (s : UInt64) (up : Bool) :

(add_n r s up).toUInt64.toNat ≤ 2 ^ 63

add_n produces small n values

theorem Floating.add_n_lt (r : UInt128) (s : UInt64) (up : Bool) (n63 : (add_n r s up).toUInt64 ≠ 2 ^ 63) :

(add_n r s up).toUInt64.toNat < 2 ^ 63

add_n produces small n values

theorem Floating.add_n_ne_min (r : UInt128) (s : UInt64) (up : Bool) (n63 : (add_n r s up).toUInt64 ≠ 2 ^ 63) :

add_n r s up ≠ Int64.minValue

add_adjust never produces n = .min

theorem Floating.coe_add_n (r : UInt128) (s : UInt64) (up : Bool) :

↑(add_n r s up).toUInt64.toNat = (↑r.toNat * 2 ^ (add_adjust r.log2 s).2.toNat).rdiv (2 ^ 65) up

add_n respects ℕ conversion

theorem Floating.coe_add_z {r : UInt128} {s : UInt64} {up : Bool} (n63 : (add_n r s up).toUInt64 ≠ 2 ^ 63) :

↑(add_n r s up) = (↑r.toNat * 2 ^ (add_adjust r.log2 s).2.toNat).rdiv (2 ^ 65) up

add_n respects ℤ conversion

theorem Floating.add_n_norm (r : UInt128) (s : UInt64) (up : Bool) :

add_n r s up ≠ 0 → add_n r s up ≠ Int64.minValue → (add_adjust r.log2 s).1 ≠ 0 → 2 ^ 62 ≤ (add_n r s up).toUInt64.toNat

add_adjust results in normalized n

theorem Floating.valid_small_shift0 {s : UInt64} :

Valid (2 ^ 62) (s + 1)

small_shift is valid

theorem Floating.valid_small_shift1 {n s : UInt64} (n63 : n ≠ 2 ^ 63) (n0 : n ≠ 0) (le_n : n.toInt64 ≠ 0 → n.toInt64 ≠ Int64.minValue → s ≠ 0 → 2 ^ 62 ≤ n.toNat) (n_le : n.toNat ≤ 2 ^ 63) :

Valid n.toInt64 s

small_shift is valid

@[irreducible]

def Floating.small_shift (n s : UInt64) (le_n : n.toInt64 ≠ 0 → n.toInt64 ≠ Int64.minValue → s ≠ 0 → 2 ^ 62 ≤ n.toNat) (n_le : n.toNat ≤ 2 ^ 63) :

Turn an almost normalized (n ≤ 2^63) value into a Floating, shifting right by at most 1

Equations

One or more equations did not get rendered due to their size.

Instances For

@[irreducible]

def Floating.add_shift_r (r : UInt128) (s : UInt64) (up : Bool) :

Turn an r * 2^(s - 64 - 2^63) value into a Floating

Equations

One or more equations did not get rendered due to their size.

Instances For

theorem Floating.val_small_shift {n s : UInt64} {le_n : n.toInt64 ≠ 0 → n.toInt64 ≠ Int64.minValue → s ≠ 0 → 2 ^ 62 ≤ n.toNat} {n_le : n.toNat ≤ 2 ^ 63} (sn : small_shift n s le_n n_le ≠ nan) :

(small_shift n s le_n n_le).val = ↑n.toNat * 2 ^ (↑s.toNat - 2 ^ 63)

small_shift is correct

theorem Floating.val_add_shift_r (r : UInt128) (s : UInt64) (up : Bool) :

Rounds (add_shift_r r s up) (↑r * 2 ^ (↑s.toNat - 64 - 2 ^ 63)) up

add_shift_r rounds in the correct direction

High 64 bit - 128 bit subtraction #

@[irreducible, inline]

def Floating.hi_sub_128 (x : UInt64) (y : UInt128) :

x * 2^64 - y

Equations

Floating.hi_sub_128 x y = { lo := (-y).lo, hi := x + (-y).hi }

Instances For

theorem Floating.toNat_hi_sub_128 {x : UInt64} {y : UInt128} (yx : y.toNat ≤ x.toNat * 2 ^ 64) :

(hi_sub_128 x y).toNat = x.toNat * 2 ^ 64 - y.toNat

hi_sub_128 is correct

Addition (definition) #

@[irreducible, inline]

def Floating.add_to_128 (x y : Floating) (up : Bool) :

Exactly rounded floating point addition, with 0 < x and special cases removed

Equations

One or more equations did not get rendered due to their size.

Instances For

@[irreducible, inline]

def Floating.pos_add (x y : Floating) (up : Bool) :

Exactly rounded floating point addition, with 0 < x and special cases removed

Equations

x.pos_add y up = Floating.add_shift_r (x.add_to_128 y up) x.s up

Instances For

@[irreducible, inline]

def Floating.add_core (x y : Floating) (up : Bool) :

Exactly rounded floating point addition, with most special cases removed

Equations

x.add_core y up = match bif x.n.isNeg then (true , -x, -y) else (false , x, y) with | (z, x, y) => have r := x.pos_add y (up != z); bif z then -r else r

Instances For

@[irreducible]

def Floating.add_bigger (x y : Floating) :

Absolute value comparison, ignoring special cases

Equations

x.add_bigger y = bif x.s == y.s then decide (y.n.uabs ≤ x.n.uabs) else decide (y.s < x.s)

Instances For

@[irreducible]

def Floating.add (x y : Floating) (up : Bool) :

Exactly rounded floating point addition

Equations

x.add y up = bif x == 0 || y == nan then y else bif y == 0 || x == nan then x else match bif x.add_bigger y then (x, y) else (y, x) with | (x, y) => x.add_core y up

Instances For

Addition is correct #

@[simp]

theorem Floating.add_bigger_neg {x y : Floating} :

(-x).add_bigger (-y) = x.add_bigger y

add_bigger doesn't care about sign

@[simp]

theorem Floating.add_bigger_abs {x y : Floating} :

x.add_bigger y.abs = x.add_bigger y

add_bigger doesn't care about abs

@[simp]

theorem Floating.not_add_bigger {x y : Floating} (h : ¬x.add_bigger y = true) :

y.add_bigger x = true

add_bigger is antisymmetric

theorem Floating.add_bigger_s {x y : Floating} (h : x.add_bigger y = true) :

y.s ≤ x.s

theorem Floating.add_bigger_n {x y : Floating} (h : x.add_bigger y = true) (xn : 0 ≤ x.n) (e : x.s = y.s) :

y.n.uabs ≤ x.n.toUInt64

theorem Floating.add_to_128_shift_lt {y : Int64} {s : UInt64} {up : Bool} (yn : 0 ≤ y) :

({ lo := 0, hi := y.toUInt64 }.shiftRightRound s up).hi.toNat < 2 ^ 63

The shifting we do of y in add_to_128 produces a small value

theorem Floating.add_to_128_shift_le {x y : Floating} (xy : x.add_bigger y = true) {up : Bool} (xn : 0 ≤ x.n) (yn : 0 ≤ y.n) :

({ lo := 0, hi := y.n.toUInt64 }.shiftRightRound (x.s - y.s) up).toNat ≤ x.n.toUInt64.toNat * 2 ^ 64

The shifting we do of y in add_to_128 produces a small value

@[simp]

theorem Floating.mul_64_pow (x : ℝ) (s : ℤ) :

x * 2 ^ 64 * 2 ^ (s - 64 - 2 ^ 63) = x * 2 ^ (s - 2 ^ 63)

Common alternative spelling of val

theorem Floating.val_64 (x : Floating) :

x.val = ↑↑x.n * 2 ^ 64 * 2 ^ (↑x.s.toNat - 64 - 2 ^ 63)

Common alternative spelling of val

theorem Floating.val_add_to_128 {x y : Floating} (up : Bool) (yn : y ≠ nan) (y0 : y ≠ 0) (xy : x.add_bigger y = true) (x0' : 0 ≤ x.n) :

Rounds (↑(x.add_to_128 y up) * 2 ^ (↑x.s.toNat - 64 - 2 ^ 63)) (x.val + y.val) up

add_to_128 rounds in the correct direction

theorem Floating.val_pos_add {x y : Floating} {up : Bool} (yn : y ≠ nan) (y0 : y ≠ 0) (xy : x.add_bigger y = true) (x0' : 0 ≤ x.n) :

Rounds (x.pos_add y up) (x.val + y.val) up

pos_add rounds in the correct direction

theorem Floating.val_add_core {x y : Floating} {up : Bool} (xn : x ≠ nan) (yn : y ≠ nan) (x0 : x ≠ 0) (y0 : y ≠ 0) (xy : x.add_bigger y = true) :

Rounds (x.add_core y up) (x.val + y.val) up

add_core rounds in the correct direction

theorem Floating.approx_add (x y : Floating) (up : Bool) {x' y' : ℝ} (ax : approx x x') (ay : approx y y') :

Rounds (x.add y up) (x' + y') up

add rounds in the correct direction

@[simp]

theorem Floating.add_nan {x : Floating} {up : Bool} :

x.add nan up = nan

add propagates nan

@[simp]

theorem Floating.nan_add {x : Floating} {up : Bool} :

nan.add x up = nan

add propagates nan

theorem Floating.ne_nan_of_add {x y : Floating} {up : Bool} (n : x.add y up ≠ nan) :

x ≠ nan ∧ y ≠ nan

add propagates nan

theorem Floating.add_le {x y : Floating} (n : x.add y false ≠ nan) :

(x.add y false).val ≤ x.val + y.val

add _ _ false rounds down

theorem Floating.le_add {x y : Floating} (n : x.add y true ≠ nan) :

x.val + y.val ≤ (x.add y true).val

add _ _ true rounds up

@[simp]

theorem Floating.add_zero (x : Floating) (up : Bool) :

x.add 0 up = x

@[simp]

theorem Floating.zero_add (x : Floating) (up : Bool) :

add 0 x up = x

Subtraction #

We use x - y = x + -y.

@[irreducible]

def Floating.sub (x y : Floating) (up : Bool) :

Equations

x.sub y up = x.add (-y) up

Instances For

theorem Floating.sub_eq_add_neg (x y : Floating) (up : Bool) :

x.sub y up = x.add (-y) up

Definition lemma for easy of use

theorem Floating.approx_sub {x' y' : ℝ} (x y : Floating) (up : Bool) (ax : approx x x') (ay : approx y y') :

Rounds (x.sub y up) (x' - y') up

sub rounds in the correct direction

@[simp]

theorem Floating.sub_nan {x : Floating} {up : Bool} :

x.sub nan up = nan

sub propagates nan

@[simp]

theorem Floating.nan_sub {x : Floating} {up : Bool} :

nan.sub x up = nan

sub propagates nan

theorem Floating.ne_nan_of_sub {x y : Floating} {up : Bool} (n : x.sub y up ≠ nan) :

x ≠ nan ∧ y ≠ nan

sub propagates nan

theorem Floating.sub_le {x y : Floating} (n : x.sub y false ≠ nan) :

(x.sub y false).val ≤ x.val - y.val

sub _ _ false rounds down

theorem Floating.le_sub {x y : Floating} (n : x.sub y true ≠ nan) :

x.val - y.val ≤ (x.sub y true).val

sub _ _ true rounds up

@[simp]

theorem Floating.sub_zero (x : Floating) (up : Bool) :

x.sub 0 up = x

@[simp]

theorem Floating.zero_sub (x : Floating) (up : Bool) :

sub 0 x up = -x