src/asm_gcd_unsigned.h

namespace asm_code {


//describes a multi-limb integer in memory and emits x86-64 assembly that operates on it
//limbs are 64 bits each and are stored least-significant first
//none of the member functions execute arithmetic themselves; they append instructions to the current asm output
struct asm_integer {
    //if a sign limb exists, it is one qword before this address. the data limbs are after this address
    reg_scalar addr_base;

    //the asm_integer functions only use addr_base. this is used to assign addr_base if it needs to be allocated
    reg_spill addr_base_spill;

    //constant byte offset added to addr_base when addressing limbs (see operator[])
    int addr_offset=0;

    bool is_signed=false;
    int size=0; //limbs. lsb limb is first. this is a multiple of 4

    asm_integer() {}
    asm_integer(reg_spill t_spill, int t_size) {
        addr_base_spill=t_spill;
        size=t_size;
    }

    //returns the memory operand text for limb pos: "[addr_base+offset]" where offset is addr_offset+pos*8
    //pos must be >=0 and <size
    string operator[](int pos) {
        assert(pos>=0 && pos<size);
        return str( "[#+#]", addr_base.name(), to_hex(addr_offset+pos*8) );
    }

    //a default-constructed integer (size 0) is treated as "no integer"; mul_add uses this for an absent addend
    bool is_null() {
        return size==0;
    }

    //end_index will return the number of nonzero limbs minus 1 (i.e. the index of the highest nonzero limb)
    //end_index should initially be >= the number of nonzero limbs minus 1, but not more than size-1
    //if the integer is 0, end_index should initially be at least 0 and the returned end_index is 0
    //regs: 3x scalar
    //
    //the emitted code is branch-free inside each step: a step moves end_index down by one limb if the limb it points at
    // is 0 and it is not already at limb 0. the step is emitted num_unroll times; only the last copy is a loop
    void update_end_index(reg_alloc regs, reg_scalar end_index) {
        EXPAND_MACROS_SCOPE;

        assert(size%4==0);
        assert(addr_offset==0); //can temporarily modify addr_base if this is false

        m.bind(end_index, "end_index");
        m.bind(addr_base, "addr_base");
        reg_scalar tmp_value=regs.bind_scalar(m, "tmp_value");
        reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0");
        reg_scalar tmp_8=regs.bind_scalar(m, "tmp_8");

        //convert index to address
        APPEND_M(str( "LEA `end_index, [`addr_base+`end_index*8]" ));

        //constants used as CMOV sources (CMOV has no immediate form)
        APPEND_M(str( "XOR `tmp_0, `tmp_0" ));
        APPEND_M(str( "MOV `tmp_8, 8" ));

        string loop_label=m.alloc_label();

        const int num_unroll=2;
        assert(num_unroll>=1);

        for (int x=0;x<num_unroll;++x) {
            //the last unrolled copy is the loop body; earlier copies run once, straight-line
            const bool is_loop_body=(x==num_unroll-1);

            if (is_loop_body) {
                APPEND_M(str( "#:", loop_label ));
            }

            APPEND_M(str( "MOV `tmp_value, [`end_index]" ));

            //tmp_value=(tmp_value==0)? 8 : 0
            //(8 if the last limb is 0, else 0)
            APPEND_M(str( "CMP `tmp_value, `tmp_0" ));
            APPEND_M(str( "MOV `tmp_value, `tmp_0" ));
            APPEND_M(str( "CMOVE `tmp_value, `tmp_8" ));

            //if (end_index==addr_base) tmp_value=0
            //(sets tmp_value to 0 if there is only 1 limb left)
            APPEND_M(str( "CMP `end_index, `addr_base" ));
            APPEND_M(str( "CMOVE `tmp_value, `tmp_0" ));

            //if tmp_value==8, go to the next lowest limb
            //if tmp_value==0, do nothing
            APPEND_M(str( "SUB `end_index, `tmp_value" ));

            //the back-edge must be emitted in the same copy as the label. this used to test x==1, which only matched
            // the label placement for num_unroll==2
            if (is_loop_body) {
                //keep looping until end_index stops changing
                APPEND_M(str( "CMP `tmp_value, `tmp_0" ));
                APPEND_M(str( "JNE #", track_asm( "update_end_index loop", loop_label ) ));
            }
        }

        //convert address to index
        APPEND_M(str( "SUB `end_index, `addr_base" ));
        APPEND_M(str( "SHR `end_index, 3" ));
    }

    //end_index=(end_index<2)? 0 : end_index-2
    //(converts the index of the top limb into the index of the lowest limb of a 3-limb head)
    //regs: 1x scalar
    void calculate_head_start(reg_alloc regs, reg_scalar end_index) {
        EXPAND_MACROS_SCOPE;

        assert(size%4==0);

        m.bind(end_index, "end_index");

        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        //SUB sets CF on unsigned borrow (end_index<2); CMOVB then replaces the wrapped value with 0
        APPEND_M(str( "XOR `tmp, `tmp" ));
        APPEND_M(str( "SUB `end_index, 2" ));
        APPEND_M(str( "CMOVB `end_index, `tmp" ));
    }

    //loads the 3 consecutive limbs starting at limb index head_start into res (res[0] is the least significant)
    //head_start+2 should be >= the index of the highest nonzero limb to avoid truncation
    //regs: 1x scalar
    void extract_head_at(reg_alloc regs, reg_scalar head_start, array<reg_scalar, 3> res) {
        EXPAND_MACROS_SCOPE;

        assert(size%4==0);

        m.bind(addr_base, "addr_base");
        m.bind(head_start, "head_start");
        m.bind(res, "res");

        reg_scalar tmp_addr=regs.bind_scalar(m, "tmp_addr");

        //tmp_addr=&limb[head_start], including the constant addr_offset
        APPEND_M(str( "LEA `tmp_addr, [`addr_base+`head_start*8+#]", to_hex(addr_offset) ));
        APPEND_M(str( "MOV `res_0, [`tmp_addr]" ));
        APPEND_M(str( "MOV `res_1, [`tmp_addr+8]" ));
        APPEND_M(str( "MOV `res_2, [`tmp_addr+16]" ));
    }

    //mul_add implementation using MULX/ADOX/ADCX. see mul_add for the contract
    //processes 2 limbs per unrolled step. the additions of the high halves are chained through OF (ADOX) and the
    // additions of c are chained through CF (ADCX), so the two carry chains do not interfere with each other.
    // MULX, MOV and NOT leave both flags untouched
    //regs: 5x scalar, one of which is RDX
    void mul_add_bmi(
        reg_alloc regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
    ) {
        EXPAND_MACROS_SCOPE;

        m.bind(b, "b");

        //5x scalar
        reg_scalar mul_low_0=regs.bind_scalar(m, "mul_low_0");
        reg_scalar mul_low_1=regs.bind_scalar(m, "mul_low_1");
        reg_scalar mul_high_0=regs.bind_scalar(m, "mul_high_0");
        reg_scalar mul_high_1=regs.bind_scalar(m, "mul_high_1");
        //bound only to reserve RDX, which is the implicit multiplicand of MULX
        reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);

        //clears OF and CF
        APPEND_M(str( "XOR RDX, RDX" ));

        //the carry in enters through the CF (ADCX) chain
        if (carry_in_is_1) {
            APPEND_M(str( "STC" ));
        }

        //MOV does not modify the flags set up above
        APPEND_M(str( "MOV RDX, `b" ));

        for (int pos=0;pos<size;pos+=2) {
            bool first=(pos==0);

            //mul_low=mul_low+mul_high>>64
            APPEND_M(str( "MULX `mul_high_0, `mul_low_0, #", a[pos] ));

            //mul_high_1 still holds the high half from the previous step; it is consumed before being overwritten below
            if (!first) {
                APPEND_M(str( "ADOX `mul_low_0, `mul_high_1" ));
            }

            APPEND_M(str( "MULX `mul_high_1, `mul_low_1, #", a[pos+1] ));
            APPEND_M(str( "ADOX `mul_low_1, `mul_high_0" ));

            if (!c.is_null()) {
                APPEND_M(str( "ADCX `mul_low_0, #", c[pos] ));
                APPEND_M(str( "ADCX `mul_low_1, #", c[pos+1] ));
            }

            if (invert_output) {
                APPEND_M(str( "NOT `mul_low_0" ));
                APPEND_M(str( "NOT `mul_low_1" ));
            }

            //a and c for these limbs have already been read, so the output may alias them
            APPEND_M(str( "MOV #, `mul_low_0", (*this)[pos] ));
            APPEND_M(str( "MOV #, `mul_low_1", (*this)[pos+1] ));
        }
    }

    //mul_add implementation using only MUL/ADD/ADC/SBB. see mul_add for the contract
    //processes 4 limbs per unrolled step. there is only one carry flag, so the carry of the multiply chain and the
    // carry of the c addition chain are each saved between steps in a register as 0 or -1 (SBB r,r) and restored
    // with "ADD r, 1"
    //regs: 11x scalar, including RAX and RDX
    void mul_add_slow(
        reg_alloc regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
    ) {
        EXPAND_MACROS_SCOPE;

        m.bind(b, "b");

        //11x scalar
        reg_scalar mul_carry=regs.bind_scalar(m, "mul_carry");
        reg_scalar add_carry=regs.bind_scalar(m, "add_carry");
        reg_scalar mul_high_4_previous=regs.bind_scalar(m, "mul_high_4_previous");
        reg_scalar mul_low_0=regs.bind_scalar(m, "mul_low_0");
        reg_scalar mul_low_1=regs.bind_scalar(m, "mul_low_1");
        reg_scalar mul_low_2=regs.bind_scalar(m, "mul_low_2");
        reg_scalar mul_low_3=regs.bind_scalar(m, "mul_low_3", reg_rax);
        reg_scalar mul_high_0=regs.bind_scalar(m, "mul_high_0");
        reg_scalar mul_high_1=regs.bind_scalar(m, "mul_high_1");
        reg_scalar mul_high_2=regs.bind_scalar(m, "mul_high_2");
        reg_scalar mul_high_3=regs.bind_scalar(m, "mul_high_3", reg_rdx);

        for (int pos=0;pos<size;pos+=4) {
            bool first=(pos==0);
            bool last=(pos==size-4);

            //multiply 4 values of a by b
            for (int x=0;x<4;++x) {
                //mul_low_3=RAX
                //mul_high_3=RDX
                APPEND_M(str( "MOV RAX, `b" ));
                APPEND_M(str( "MUL QWORD PTR #", a[pos+x] ));

                if (x==3) {
                    //the 4th product is left in RAX:RDX, which are mul_low_3 and mul_high_3
                    assert(mul_low_3.value==reg_rax.value);
                    assert(mul_high_3.value==reg_rdx.value);
                } else {
                    APPEND_M(str( "MOV `mul_low_#, RAX", x ));
                    APPEND_M(str( "MOV `mul_high_#, RDX", x ));
                }
            }

            //mul_low=mul_low+mul_high>>64
            if (first) {
                //mul_carry==0 ; mul_high_4_previous==0
                APPEND_M(str( "ADD `mul_low_1, `mul_high_0" ));
            } else {
                APPEND_M(str( "ADD `mul_carry, 1" )); // CF=(mul_carry==-1)? 1 : 0
                APPEND_M(str( "ADC `mul_low_0, `mul_high_4_previous" ));
                APPEND_M(str( "ADC `mul_low_1, `mul_high_0" ));
            }

            APPEND_M(str( "ADC `mul_low_2, `mul_high_1" ));
            APPEND_M(str( "ADC `mul_low_3, `mul_high_2" ));

            //save the top high half and the carry for the next group of 4 limbs. for the last group they are dropped
            if (!last) {
                APPEND_M(str( "MOV `mul_high_4_previous, `mul_high_3" ));
                APPEND_M(str( "SBB `mul_carry, `mul_carry" )); // mul_carry=(CF)? -1 : 0
            }

            if (!c.is_null()) {
                //mul_low=mul_low+c
                //output mul_low

                if (first) {
                    if (carry_in_is_1) {
                        APPEND_M(str( "STC" ));
                        APPEND_M(str( "ADC `mul_low_0, #", c[pos] ));
                    } else {
                        APPEND_M(str( "ADD `mul_low_0, #", c[pos] ));
                    }
                } else {
                    APPEND_M(str( "ADD `add_carry, 1" )); // CF=(add_carry==-1)? 1 : 0
                    APPEND_M(str( "ADC `mul_low_0, #", c[pos] ));
                }

                for (int x=1;x<4;++x) {
                    APPEND_M(str( "ADC `mul_low_#, #", x, c[pos+x] ));
                }

                if (!last) {
                    APPEND_M(str( "SBB `add_carry, `add_carry" )); // add_carry=(CF)? -1 : 0
                }
            }

            //all 4 limbs of a and c for this group have been read, so the output may alias them
            for (int x=0;x<4;++x) {
                if (invert_output) {
                    APPEND_M(str( "NOT `mul_low_#", x ));
                }
                APPEND_M(str( "MOV #, `mul_low_#", (*this)[pos+x], x ));
            }
        }
    }

    // (*this)=a*b+c+(carry_in_is_1? 1 : 0)
    // if (invert_output) (*this)=~(*this)
    //the result is truncated to size limbs
    //all of the integers must have the same size (which is a multiple of 4)
    //c may be a null (default-constructed) integer, in which case nothing is added. carry_in_is_1 requires a non-null c
    //a or c can alias with *this (as long as the aliasing is not partial)
    //regs: 11x scalar
    //
    //to calculate a*b-c*d:
    //-first calculate ~(c*d)
    //-then calculate a*b+(~(c*d))+1
    //
    //dispatches to mul_add_bmi if enable_all_instructions is set, otherwise to mul_add_slow
    void mul_add(
        const reg_alloc& regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
    ) {
        EXPAND_MACROS_SCOPE;

        assert(!carry_in_is_1 || !c.is_null());
        assert(size%4==0);
        assert(size==a.size && (c.is_null() || size==c.size));

        if (enable_all_instructions) {
            mul_add_bmi(regs, a, b, c, invert_output, carry_in_is_1);
        } else {
            mul_add_slow(regs, a, b, c, invert_output, carry_in_is_1);
        }
    }
};

//sets res to the right shift amount required for the uppermost limb to be 0. this is between 0 and 64 inclusive
//(this is the bit length of limbs[2]; limbs[2] is the only limb the emitted code reads)
//the emitted code writes only res and one temporary
//regs: 1x scalar
void calculate_shift_amount(reg_alloc regs, array<reg_scalar, 3> limbs, reg_scalar res) {
    EXPAND_MACROS_SCOPE;

    m.bind(limbs, "limbs");
    m.bind(res, "res");

    reg_scalar tmp=regs.bind_scalar(m, "tmp");

    //res=[index of the highest set bit in limbs_2]+1
    //(BSR leaves its destination undefined if limbs_2 is 0; that case is overwritten below)
    APPEND_M(str( "BSR `res, `limbs_2" ));
    APPEND_M(str( "INC `res" ));

    //res=num bits of limbs_2 [which is also the right shift amount]
    //(this is 0 if limbs_2 is 0)
    //tmp is zeroed so it can be both the comparison operand and the CMOV source
    APPEND_M(str( "XOR `tmp, `tmp" ));
    APPEND_M(str( "CMP `limbs_2, `tmp" ));
    APPEND_M(str( "CMOVE `res, `tmp" ));
}

//shifts the 3-limb value [limbs[2]:limbs[1]:limbs[0]] right by amount bits
//amount must be >=0 and <=64
//this only calculates the lower 2 limbs of the result (limbs[0] and limbs[1]); limbs[2] is left unchanged
//regs: 1x scalar
//(the scalar is RCX, which the emitted code overwrites with amount because a variable shift count must be in CL)
//in-place
void shift_right(reg_alloc regs, array<reg_scalar, 3> limbs, reg_scalar amount) {
    EXPAND_MACROS_SCOPE;

    m.bind(limbs, "limbs");
    m.bind(amount, "amount");

    regs.get_scalar(reg_rcx);

    APPEND_M(str( "MOV RCX, `amount" ));

    //for 64-bit operands the SHRD count is masked to 6 bits, so an amount of 64 behaves like 0 (no-op).
    //that case is fixed up by the CMOVEs at the end

    // if (amount<64) res[0]=[limbs[1]:limbs[0]]>>amount
    // if (amount==64) no-op
    APPEND_M(str( "SHRD `limbs_0, `limbs_1, CL" ));

    // if (amount<64) res[1]=[limbs[2]:limbs[1]]>>amount
    // if (amount==64) no-op
    APPEND_M(str( "SHRD `limbs_1, `limbs_2, CL" ));

    // if (amount==64) shift by a whole limb: res[0]=limbs[1] ; res[1]=limbs[2]
    // (limbs_0 is assigned first so that it receives the old limbs_1)
    APPEND_M(str( "CMP `amount, 64" ));
    APPEND_M(str( "CMOVE `limbs_0, `limbs_1" ));
    APPEND_M(str( "CMOVE `limbs_1, `limbs_2" ));
}

//emits the assembly for a multi-limb unsigned gcd loop. each loop iteration extracts a 128-bit head of a, b and
//threshold, runs gcd_128 on the heads to get a 2x2 matrix [u0 v0 ; u1 v1], and applies the matrix to the full a and b
//
//this must be true: a>=b; a>=threshold
//
//all of the integers should have spilled addresses with offsets of 0. all of their sizes should be the same
//the input a and b values should go into spill_a and spill_b. spill_a_2 and spill_b_2 should be uninitialized
//spill_iter will be between -1 and max_iterations
//a and b alternate between the two buffers on every iteration. spill_iter is incremented before the loop exits, so
// the final a value is in spill_a if the returned spill_iter minus 1 is odd (this includes -1), otherwise it is in
// spill_a_2. same with b
//
//for each iteration, including iteration -1, the following will happen:
//-64 bytes of data is written to *(spill_out_uv_addr + iter*64)
//-then, *spill_out_uv_counter_addr is set to spill_uv_counter_start+iter
//
//the data has the following format: [u0] [u1] [v0] [v1] [parity] [exit_flag]
//-each entry is 8 bytes
//-if iter is -1, only exit_flag is initialized and the rest have undefined values
//-if exit_flag is 1, this is the final result
//
//no more than max_iterations+1 results will be outputted. there will be an error if there are more results than this
//(this includes iteration -1)
//
//spill_a_end_index must be < a's size and >= 0. any limbs past this must be 0 for a, b, and threshold, but only up to the next
// multiple of 4 limbs. (e.g. if spill_a_end_index is 6, there are 7 limbs so the 8th limb must be 0 and the rest can be uninitialized)
//
//the return value of iter is the total number of iterations performed, which is at least 0. iter-1 is the parity of the last iteration
void gcd_unsigned(
    reg_alloc regs_parent,
    asm_integer spill_a, asm_integer spill_b, asm_integer spill_a_2, asm_integer spill_b_2, asm_integer spill_threshold,
    reg_spill spill_uv_counter_start, reg_spill spill_out_uv_counter_addr, reg_spill spill_out_uv_addr,
    reg_spill spill_iter, reg_spill spill_a_end_index, int max_iterations
) {
    EXPAND_MACROS_SCOPE_PUBLIC;

    track_asm( "gcd_unsigned" );

    //all integers must be the same size, have no address register assigned yet, and have no offset
    int int_size=spill_a.size;
    assert(spill_a.addr_offset==0 && spill_b.addr_offset==0 && spill_threshold.addr_offset==0);
    assert(spill_a.addr_base.value==-1 && spill_b.addr_base.value==-1 && spill_threshold.addr_base.value==-1);
    assert(spill_a_2.addr_offset==0 && spill_b_2.addr_offset==0);
    assert(spill_a_2.addr_base.value==-1 && spill_b_2.addr_base.value==-1);
    assert(spill_a.size==int_size && spill_b.size==int_size && spill_threshold.size==int_size);
    assert(spill_a_2.size==int_size && spill_b_2.size==int_size);

    m.bind(spill_a.addr_base_spill, "spill_a_addr_base");
    m.bind(spill_a_2.addr_base_spill, "spill_a_2_addr_base");

    m.bind(spill_b.addr_base_spill, "spill_b_addr_base");
    m.bind(spill_b_2.addr_base_spill, "spill_b_2_addr_base");

    m.bind(spill_threshold.addr_base_spill, "spill_threshold_addr_base");

    m.bind(spill_iter, "spill_iter");
    m.bind(spill_uv_counter_start, "spill_uv_counter_start");
    m.bind(spill_out_uv_addr, "spill_out_uv_addr");
    m.bind(spill_out_uv_counter_addr, "spill_out_uv_counter_addr");
    m.bind(spill_a_end_index, "spill_a_end_index");

    //outputs of gcd_128
    reg_spill spill_u_0=regs_parent.bind_spill(m, "spill_u_0");
    reg_spill spill_u_1=regs_parent.bind_spill(m, "spill_u_1");
    reg_spill spill_v_0=regs_parent.bind_spill(m, "spill_v_0");
    reg_spill spill_v_1=regs_parent.bind_spill(m, "spill_v_1");
    reg_spill spill_parity=regs_parent.bind_spill(m, "spill_parity");
    reg_spill spill_is_lehmer=regs_parent.bind_spill(m, "spill_is_lehmer");

    //128-bit heads passed to gcd_128. the "_8" names are the upper 8 bytes of each
    reg_spill spill_a_128=regs_parent.bind_spill(m, "spill_a_128", 16, 8);
    reg_spill spill_b_128=regs_parent.bind_spill(m, "spill_b_128", 16, 8);
    reg_spill spill_threshold_128=regs_parent.bind_spill(m, "spill_threshold_128", 16, 8);

    m.bind(spill_a_128+8, "spill_a_128_8");
    m.bind(spill_b_128+8, "spill_b_128_8");
    m.bind(spill_threshold_128+8, "spill_threshold_128_8");

    APPEND_M(str( "MOV QWORD PTR `spill_iter, -1" ));

    string loop_start=m.alloc_label();
    string loop=m.alloc_label();
    string loop_exit=m.alloc_label();

    //iteration -1 skips gcd_128 and the multiplies; it only extracts the heads and checks the exit condition
    APPEND_M(str( "JMP #", loop_start ));

    APPEND_M(str( "#:", loop ));

    //iter even: old_a=a  , old_b=b   ; new_a=a_2, new_b=b_2
    //iter odd:  old_a=a_2, old_b=b_2 ; new_a=a  , new_b=b

    gcd_128(
        regs_parent,
        {spill_a_128, spill_b_128}, {spill_u_0, spill_u_1}, {spill_v_0, spill_v_1},
        spill_parity, spill_is_lehmer, spill_threshold_128,
        track_asm( "gcd_unsigned error: gcd 128 stuck", m.alloc_error_label() )
    );

    string exit_multiply_uv=m.alloc_label();

    //dispatch on spill_a_end_index to a multiply routine unrolled for the limb count rounded up to a multiple of 4
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        string jump_table_label=m.alloc_label();

#ifdef CHIAOSX
        APPEND_M(str( ".text " ));
#else
        APPEND_M(str( ".text 1" ));
#endif
        APPEND_M(str( ".balign 8" ));
        APPEND_M(str( "#:", jump_table_label ));

#ifdef CHIAOSX
        //no jump table on this platform; a chain of compares and conditional jumps is emitted instead
        APPEND_M(str( ".text" ));

        APPEND_M(str( "MOV `tmp, `spill_a_end_index" ));

        for (int end_index=0;end_index<int_size;++end_index) {
            int size=end_index+1;

            int mapped_size=size;
            while (mapped_size==0 || mapped_size%4!=0) {
                ++mapped_size;
            }

            //tmp holds an end index (number of limbs minus 1), so it has to be compared with end_index. this
            // matches the jump table below, which is indexed by the end index. comparing with size would select a
            // multiply that is 4 limbs too small when the end index is a multiple of 4 (other than 0)
            APPEND_M(str( "CMP `tmp, #", end_index ));

            APPEND_M(str( "JE ")+asmprefix+str("multiply_uv_size_#", mapped_size ));
        }
#else
        //jump table entry i is the multiply routine for an end index of i (i+1 limbs)
        for (int end_index=0;end_index<int_size;++end_index) {
            int size=end_index+1;

            int mapped_size=size;
            while (mapped_size==0 || mapped_size%4!=0) {
                ++mapped_size;
            }

            APPEND_M(str( ".quad ")+asmprefix+str("multiply_uv_size_#", mapped_size ));
        }
        APPEND_M(str( ".text" ));

        APPEND_M(str( "MOV `tmp, `spill_a_end_index" ));
        APPEND_M(str( "JMP QWORD PTR [#+`tmp*8]", jump_table_label ));
#endif
    }
    for (int size=4;size<=int_size;size+=4) {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        APPEND_M(asmprefix+str( "multiply_uv_size_#:", size ));

        track_asm(str( "gcd_unsigned multiply uv size #", size ));

        //reg_scalar t=regs.bind_scalar(m, "t");

        // even:
        // new_a=a*u_0 - b*v_0;
        // new_b=b*v_1 - a*u_1;
        //
        // tmp0=b*v_0
        // tmp1=a*u_1
        // new_a=a*u_0 - tmp0
        // new_b=b*v_1 - tmp1
        //
        // odd:
        // new_a=b*v_0 - a*u_0;
        // new_b=a*u_1 - b*v_1;
        //
        // tmp0=a*u_0
        // tmp1=b*v_1
        // new_a=b*v_0 - tmp0
        // new_b=a*u_1 - tmp1
        //
        // in general:
        // tmp0=(even?b:a)*(even?v_0:u_0)
        // tmp1=(even?a:b)*(even?u_1:v_1)
        // new_a=(even?a:b)*(even?u_0:v_0) - tmp0
        // new_b=(even?b:a)*(even?v_1:u_1) - tmp1
        //
        // ("even" and "odd" here refer to spill_parity, not to iter)

        reg_scalar addr_a=regs.bind_scalar(m, "addr_a");
        reg_scalar addr_b=regs.bind_scalar(m, "addr_b");
        reg_scalar addr_new=regs.bind_scalar(m, "addr_new");
        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        reg_spill spill_mod_u_0=regs.bind_spill(m, "spill_mod_u_0");
        reg_spill spill_mod_u_1=regs.bind_spill(m, "spill_mod_u_1");
        reg_spill spill_mod_v_0=regs.bind_spill(m, "spill_mod_v_0");
        reg_spill spill_mod_v_1=regs.bind_spill(m, "spill_mod_v_1");

        reg_spill spill_addr_b_new=regs.bind_spill(m, "spill_addr_b_new");

        //the flags from this CMP are used by both passes of the loop below (MOV and CMOV do not modify flags)
        APPEND_M(str( "MOV `tmp, `spill_parity" ));
        APPEND_M(str( "CMP `tmp, 0" ));

        //addr_a, addr_b and addr_new are used as plain temporaries here
        for (int x=0;x<2;++x) {
            APPEND_M(str( "MOV `addr_a, `spill_u_#", x ));
            APPEND_M(str( "MOV `addr_b, `spill_v_#", x ));

            //if (spill_parity!=0) swap(u[x], v[x])
            APPEND_M(str( "MOV `addr_new, `addr_a" ));
            APPEND_M(str( "CMOVNE `addr_a, `addr_b" ));
            APPEND_M(str( "CMOVNE `addr_b, `addr_new" ));

            APPEND_M(str( "MOV `spill_mod_u_#, `addr_a", x ));
            APPEND_M(str( "MOV `spill_mod_v_#, `addr_b", x ));
        }

        APPEND_M(str( "MOV `addr_new, `spill_iter" ));
        APPEND_M(str( "TEST `addr_new, 1" )); // ZF=even iteration

        //addr_a=(even iteration)? &a : &a_2
        APPEND_M(str( "MOV `addr_a, `spill_a_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_a, `spill_a_2_addr_base" ));

        //addr_b=(even iteration)? &b : &b_2
        APPEND_M(str( "MOV `addr_b, `spill_b_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_b, `spill_b_2_addr_base" ));

        //if (spill_parity!=0) swap(addr_a, addr_b)
        APPEND_M(str( "CMP `tmp, 0" ));
        APPEND_M(str( "MOV `addr_new, `addr_a" ));
        APPEND_M(str( "CMOVNE `addr_a, `addr_b" ));
        APPEND_M(str( "CMOVNE `addr_b, `addr_new" ));

        //done using tmp (spill_parity)

        //spill_addr_b_new=(even iteration)? &b_2 : &b
        APPEND_M(str( "MOV `addr_new, `spill_iter" ));
        APPEND_M(str( "TEST `addr_new, 1" )); // ZF=even iteration
        APPEND_M(str( "MOV `addr_new, `spill_b_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_new, `spill_b_addr_base" ));
        APPEND_M(str( "MOV `spill_addr_b_new, `addr_new" ));

        //addr_new=(even iteration)? &a_2 : &a
        //(still uses the flags from the TEST above)
        APPEND_M(str( "MOV `addr_new, `spill_a_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_new, `spill_a_addr_base" ));

        //this can be a, a_2, b, or b_2 depending on iter and parity
        asm_integer a;
        a.size=int_size;
        a.addr_base=addr_a;

        asm_integer b;
        b.size=int_size;
        b.addr_base=addr_b;

        //initially new_a
        asm_integer new_ab;
        new_ab.size=int_size;
        new_ab.addr_base=addr_new;

        //tmp0 and tmp1 are addressed relative to RSP instead of through an address register
        reg_spill tmp0_spill=regs.get_spill(int_size*8, 8);
        asm_integer tmp0;
        tmp0.size=int_size;
        tmp0.addr_base=reg_rsp;
        tmp0.addr_offset=tmp0_spill.get_rsp_offset();

        reg_spill tmp1_spill=regs.get_spill(int_size*8, 8);
        asm_integer tmp1;
        tmp1.size=int_size;
        tmp1.addr_base=reg_rsp;
        tmp1.addr_offset=tmp1_spill.get_rsp_offset();

        //the two products that get subtracted are stored inverted; adding an inverted value with a carry in of 1
        // is the same as subtracting it (see asm_integer::mul_add)

        // tmp0=(even?b:a)*(even?v_0:u_0)
        APPEND_M(str( "MOV `tmp, `spill_mod_v_0" ));
        tmp0.mul_add(regs, b, tmp, asm_integer(), true, false);

        // tmp1=(even?a:b)*(even?u_1:v_1)
        APPEND_M(str( "MOV `tmp, `spill_mod_u_1" ));
        tmp1.mul_add(regs, a, tmp, asm_integer(), true, false);

        // new_a=(even?a:b)*(even?u_0:v_0) - tmp0
        APPEND_M(str( "MOV `tmp, `spill_mod_u_0" ));
        new_ab.mul_add(regs, a, tmp, tmp0, false, true);

        // new_b=(even?b:a)*(even?v_1:u_1) - tmp1
        //(addr_new is repointed at the new b buffer, so new_ab now refers to new_b)
        APPEND_M(str( "MOV `addr_new, `spill_addr_b_new" ));
        APPEND_M(str( "MOV `tmp, `spill_mod_v_1" ));
        new_ab.mul_add(regs, b, tmp, tmp1, false, true);

        APPEND_M(str( "JMP #", exit_multiply_uv ));
    }
    APPEND_M(str( "#:", exit_multiply_uv ));

    //8x
    reg_scalar iter=regs_parent.bind_scalar(m, "iter");
    reg_scalar is_lehmer=regs_parent.bind_scalar(m, "is_lehmer");
    reg_scalar a_head_0=regs_parent.bind_scalar(m, "a_head_0");
    reg_scalar a_head_1=regs_parent.bind_scalar(m, "a_head_1");
    reg_scalar b_head_0=regs_parent.bind_scalar(m, "b_head_0");
    reg_scalar b_head_1=regs_parent.bind_scalar(m, "b_head_1");
    reg_scalar a_head_start=regs_parent.bind_scalar(m, "a_head_start");
    reg_scalar shift_right_amount=regs_parent.bind_scalar(m, "shift_right_amount");

    //extract the 128-bit heads of the current a and b
    APPEND_M(str( "#:", loop_start ));
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        //6x + 3x from called functions
        reg_scalar addr_a=regs.bind_scalar(m, "addr_a", reg_rax);
        reg_scalar addr_b=regs.bind_scalar(m, "addr_b", reg_rdx);
        reg_scalar b_head_2=regs.bind_scalar(m, "b_head_2");
        reg_scalar a_head_2=regs.bind_scalar(m, "a_head_2");

        APPEND_M(str( "MOV `iter, `spill_iter" ));

        //iter is the iteration that just finished, so the current values are in the buffers it wrote to
        //(-1 is odd, so the initial values are read from a and b)

        //addr_a=(even iteration)? &a_2 : &a
        APPEND_M(str( "TEST `iter, 1" )); // ZF=even iteration
        APPEND_M(str( "MOV `addr_a, `spill_a_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_a, `spill_a_addr_base" ));

        //addr_b=(even iteration)? &b_2 : &b
        APPEND_M(str( "MOV `addr_b, `spill_b_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_b, `spill_b_addr_base" ));

        asm_integer a;
        a.size=int_size;
        a.addr_base=addr_a;

        asm_integer b;
        b.size=int_size;
        b.addr_base=addr_b;

        //the previous end index is the starting point for finding the end index of the current a
        APPEND_M(str( "MOV `a_head_start, `spill_a_end_index" ));
        a.update_end_index(regs, a_head_start);
        APPEND_M(str( "MOV `spill_a_end_index, `a_head_start" ));

        //is_lehmer=(a_end_index>=2)
        //(a_end_index is stored in a_head_start)
        APPEND_M(str( "XOR `is_lehmer, `is_lehmer" ));
        APPEND_M(str( "CMP `a_head_start, 2" ));
        APPEND_M(str( "SETAE `is_lehmer_8" ));
        APPEND_M(str( "MOV `spill_is_lehmer, `is_lehmer" ));

        a.calculate_head_start(regs, a_head_start);

        //the shift amount is calculated from a and then also applied to b (and to threshold below), so that all of
        // the heads are aligned the same way
        a.extract_head_at(regs, a_head_start, {a_head_0, a_head_1, a_head_2});
        calculate_shift_amount(regs, {a_head_0, a_head_1, a_head_2}, shift_right_amount);
        shift_right(regs, {a_head_0, a_head_1, a_head_2}, shift_right_amount);

        b.extract_head_at(regs, a_head_start, {b_head_0, b_head_1, b_head_2});
        shift_right(regs, {b_head_0, b_head_1, b_head_2}, shift_right_amount);

        APPEND_M(str( "MOV `spill_a_128, `a_head_0" ));
        APPEND_M(str( "MOV `spill_a_128_8, `a_head_1" ));

        APPEND_M(str( "MOV `spill_b_128, `b_head_0" ));
        APPEND_M(str( "MOV `spill_b_128_8, `b_head_1" ));
    }

    //9x
    //iter, is_lehmer, b_head_0, b_head_1, a_head_start, shift_right_amount
    reg_scalar exit_flag=regs_parent.bind_scalar(m, "exit_flag");

    //extract the head of threshold, check for errors and calculate exit_flag
    //clobbers is_lehmer
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        //4x + 1x from called functions
        reg_scalar addr_threshold=regs.bind_scalar(m, "addr_threshold", reg_rax);
        reg_scalar threshold_head_0=regs.bind_scalar(m, "threshold_head_0", reg_rdx);
        reg_scalar threshold_head_1=regs.bind_scalar(m, "threshold_head_1");
        reg_scalar threshold_head_2=regs.bind_scalar(m, "threshold_head_2");

        //addr_threshold=&threshold
        APPEND_M(str( "MOV `addr_threshold, `spill_threshold_addr_base" ));

        asm_integer threshold;
        threshold.size=int_size;
        threshold.addr_base=addr_threshold;

        threshold.extract_head_at(regs, a_head_start, {threshold_head_0, threshold_head_1, threshold_head_2});
        shift_right(regs, {threshold_head_0, threshold_head_1, threshold_head_2}, shift_right_amount);

        APPEND_M(str( "MOV `spill_threshold_128, `threshold_head_0" ));
        APPEND_M(str( "MOV `spill_threshold_128_8, `threshold_head_1" ));

        //if (a_head<=threshold_head) goto error
        //(addr_threshold and threshold_head_2 are reused as scratch for threshold_head-a_head. there is no borrow
        // if threshold_head>=a_head)
        APPEND_M(str( "MOV `addr_threshold, `threshold_head_0" ));
        APPEND_M(str( "MOV `threshold_head_2, `threshold_head_1" ));
        APPEND_M(str( "SUB `addr_threshold, `a_head_0" ));
        APPEND_M(str( "SBB `threshold_head_2, `a_head_1" ));
        APPEND_M(str( "JNC #", track_asm( "gcd_unsigned error: a_head<=threshold_head", m.alloc_error_label() ) ));

        //threshold_head' = threshold_head-b_head
        APPEND_M(str( "XOR `exit_flag, `exit_flag" ));
        APPEND_M(str( "SUB `threshold_head_0, `b_head_0" ));
        APPEND_M(str( "SBB `threshold_head_1, `b_head_1" ));
        APPEND_M(str( "SETNC `exit_flag_8" )); //exit_flag = (threshold_head>=b_head)

        //if (b_head==threshold_head && is_lehmer) goto error
        APPEND_M(str( "OR `threshold_head_0, `threshold_head_1" ));
        APPEND_M(str( "DEC `is_lehmer" )); // is_lehmer'=(is_lehmer)? 0 : ~0
        APPEND_M(str( "OR `threshold_head_0, `is_lehmer" )); //ZF = (threshold_head'==0 && is_lehmer)
        APPEND_M(str( "JZ #", track_asm( "gcd_unsigned error: b_head==threshold_head and is_lehmer", m.alloc_error_label() ) ));
    }

    //9x

    //output the result of this iteration, then either exit or go to the next iteration
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        //2x
        reg_scalar out_uv_addr=regs.bind_scalar(m, "out_uv_addr");
        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        //out_uv_addr = spill_out_uv_addr + iter*64
        //note: iter can be -1
        APPEND_M(str( "MOV `out_uv_addr, `iter" ));
        APPEND_M(str( "SHL `out_uv_addr, 6" ));
        APPEND_M(str( "ADD `out_uv_addr, `spill_out_uv_addr" ));

        //[u0] [u1] [v0] [v1] [parity] [exit_flag]
        APPEND_M(str( "MOV `tmp, `spill_u_0" ));
        APPEND_M(str( "MOV [`out_uv_addr], `tmp" ));

        APPEND_M(str( "MOV `tmp, `spill_u_1" ));
        APPEND_M(str( "MOV [`out_uv_addr+8], `tmp" ));

        APPEND_M(str( "MOV `tmp, `spill_v_0" ));
        APPEND_M(str( "MOV [`out_uv_addr+16], `tmp" ));

        APPEND_M(str( "MOV `tmp, `spill_v_1" ));
        APPEND_M(str( "MOV [`out_uv_addr+24], `tmp" ));

        APPEND_M(str( "MOV `tmp, `spill_parity" ));
        APPEND_M(str( "MOV [`out_uv_addr+32], `tmp" ));

        APPEND_M(str( "MOV [`out_uv_addr+40], `exit_flag" ));

        //done assigning the data; can now increment the counter. this is not atomic because only this thread can write to the counter
        //(the counter must be 8-aligned)
        //x86 uses acq_rel ordering on all of the loads and stores so no fences are required
        APPEND_M(str( "MOV `tmp, `spill_uv_counter_start" ));
        APPEND_M(str( "ADD `tmp, `iter" ));
        APPEND_M(str( "MOV `out_uv_addr, `spill_out_uv_counter_addr" ));
        APPEND_M(str( "MOV [`out_uv_addr], `tmp" ));

        //spill_iter is incremented before the exit check, so on exit it is the number of iterations performed
        APPEND_M(str( "INC `iter" ));
        APPEND_M(str( "MOV `spill_iter, `iter" ));

        APPEND_M(str( "CMP `exit_flag, 0" ));
        APPEND_M(str( "JNE #", loop_exit ));

        //another iteration is required; it would be iteration number iter, which must be <max_iterations
        APPEND_M(str( "CMP `iter, #", to_hex(max_iterations) )); //signed
        APPEND_M(str( "JGE #", track_asm( "gcd_unsigned error: max_iterations exceeded", m.alloc_error_label() ) ));
    }

    APPEND_M(str( "JMP #", loop ));

    APPEND_M(str( "#:", loop_exit ));
}


}