// Datapath word length in bits: the result port and the internal working
// registers are N_BITS wide, while the operand inputs are N_BITS-1 wide.
`define N_BITS 16 // word length

// A realistic implementation of "GCD fast", literally using only 2 subtraction units.
//
// Subtraction-based GCD: on every clock event the larger of the two working
// registers is replaced by the difference of the two, until both are equal.
//
// Ports:
//   x, y   (in)  operands, N_BITS-1 bits wide. They are sampled exactly once,
//                on the first clock event; later changes are ignored.
//   out    (out) current value of the internal x register; it holds the
//                result while `valid` is high.
//   valid  (out) high once the operands have been latched and the two working
//                registers are equal.
//
// Both operands must be positive. If exactly one of them is zero the working
// registers never change and `valid` is never asserted.
module gcd_fast_impl (x, y, out, valid);
    output [`N_BITS-1:0] out;
    output valid;

    input [`N_BITS-2:0] x, y;  // must be positive, so 1 bit less

    reg _init;                  // 0 until the operands have been latched
    reg [`N_BITS-1:0] _x, _y;   // working registers; the top bit is always 0

    // The two subtraction units. Because the top bit of _x and _y is always
    // 0, the top bit of each difference acts as its sign bit.
    wire [`N_BITS-1:0] xmy, ymx;  // x-y, y-x
    assign xmy = _x - _y;
    assign ymx = _y - _x;

    wire xmy_neg, ymx_neg;  // x-y < 0, y-x < 0
    assign xmy_neg = xmy[`N_BITS-1];
    assign ymx_neg = ymx[`N_BITS-1];

    // Neither difference is negative, so neither register is smaller.
    wire x_eq_y;  // x == y
    assign x_eq_y = ~xmy_neg & ~ymx_neg;

    assign valid = _init & x_eq_y;
    assign out = _x;

    initial begin
        _init = 1'b0;
    end

    // NOTE(review): $global_clock is a formal-verification style global
    // clocking event (as accepted by Yosys) - confirm against the tool flow.
    always @($global_clock) begin
        if (_init) begin
            // Only the strictly larger register is reduced; once the two are
            // equal both hold their value.
            _x <= ymx_neg ? xmy : _x;
            _y <= xmy_neg ? ymx : _y;
        end else begin
            // Zero-extend with an explicitly sized bit: unsized constants
            // are not permitted inside a concatenation.
            _x <= {1'b0, x};
            _y <= {1'b0, y};
        end
        _init <= 1'b1;
    end
endmodule
