24 December 2011

Specializations of general banded matrix-vector multiplication

I keep needing mixed real/complex level 2 BLAS operations for real-valued, general banded matrices. It's simple to modify Netlib's dgbmv for mixed real/complex data, but the result always feels unsatisfactory. Partly because I'd like to avoid a Fortran dependency in my project. Partly because I'd like to build kernels for the special case where I know my matrix's upper and lower bandwidth at compile time (optimizers can really rip there, according to fellow student Myoungkyu Lee).

I finally got off (err... on?) my duff and reworked dgbmv. The result is a snippet that can be #included from a C99 source file, where the proper defines turn it into a real, complex, or mixed real/complex routine (sample instantiations follow the template below). I've uglified it sufficiently that both GNU's and Intel's compilers inform me at -O3 that they've vectorized the loops handling both the A and A**T cases for contiguous vectors.

// Available #include-time parameters, each shown with a sample usage.
// Each of these is #undef-ed at the end of this template file
//
// #define GBMV_STATIC    /*empty*/         /* Use for static function    */
// #define GBMV_FUNCTION  dgbmv             /* Function name              */
// #define GBMV_COMPONENT double            /* Type of matrices           */
// #define GBMV_SCALAR    double            /* Type of coeffs and vectors */
// #define GBMV_KL        const int kl,     /* Matrix lower bandwidth     */
// #define GBMV_KU        const int ku,     /* Matrix upper bandwidth     */

#include <ctype.h>   /* toupper */
#include <stdlib.h>  /* abs     */
#include <string.h>  /* memset  */

#ifndef __GBMV_INTERNAL_ONCE
#define __GBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __GBMV_INTERNAL_ONCE */

GBMV_STATIC int GBMV_FUNCTION(
    char trans, const int m, const int n, GBMV_KL GBMV_KU
    const GBMV_SCALAR alpha, const GBMV_COMPONENT *       restrict a, int lda,
                             const    GBMV_SCALAR * const restrict x, const int incx,
    const GBMV_SCALAR beta,           GBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dgbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    trans = toupper(trans);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (trans != 'N' && trans != 'T' && trans != 'C') {
        return 1;
    } else if (m < 0) {
        return 2;
    } else if (n < 0) {
        return 3;
    } else if (kl < 0) {
        return 4;
    } else if (ku < 0) {
        return 5;
    } else if (lda <= kl + ku) {
        return 8;
    } else if (incx == 0) {
        return 10;
    } else if (incy == 0) {
        return 13;
    }

    // Quick return if possible
    if (m == 0 || n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Set the lengths of the vectors x and y.
    const int lenx = trans == 'N' ? n : m;
    const int leny = trans == 'N' ? m : n;

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, leny*sizeof(y[0]));
            } else {
                for (int i = 0; i < leny; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - lenx);
    int ky = incy > 0 ? 0 : incy*(1 - leny);

    // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
    // Incorporate the ku offset and decrement lda to speed indexing in loops.
    a += ku;
    --lda;

    // Perform the banded matrix-vector accumulation
    const int klp1 = kl + 1;

    if (trans == 'N') {   // Form y := alpha*A*x + y...

        int jx = kx;
        if (incy == 1) {  // ...for contiguous y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                for (int i = il; i < iu; ++i) {
                    y[i] += temp*a[i];
                }
                jx += incx;
                a += lda;
            }
        } else {          // ...for strided y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    y[iy] += temp*a[i];
                    iy += incy;
                }
                jx += incx;
                if (j >= ku) {
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {              // Form y := alpha*A**T*x + y...

        int jy = ky;
        if (incx == 1) {  // ...for contiguous x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[i];
                }
                y[jy] += alpha * temp;
                jy += incy;
                a += lda;
            }
        } else {          // ...for strided x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                int ix = kx;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[ix];
                    ix += incx;
                }
                y[jy] += alpha*temp;
                jy += incy;
                if (j >= ku) {
                    kx += incx;
                }
                a += lda;
            }
        }
    }

    return 0;
}

#undef GBMV_STATIC
#undef GBMV_FUNCTION
#undef GBMV_COMPONENT
#undef GBMV_SCALAR
#undef GBMV_KL
#undef GBMV_KU
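
For concreteness, here are two hypothetical instantiations. Neither comes from a real project: the file name gbmv.def, the function names, and the bandwidth values are made up for illustration. The first produces a mixed real/complex routine that applies a real-valued banded matrix to complex vectors; the second is one plausible way to bake the bandwidths in at compile time by defining GBMV_KL and GBMV_KU to nothing and supplying kl and ku as enum constants.

// Sketch only: assumes the template above is saved as "gbmv.def".
#include <complex.h>

// Mixed real/complex instantiation: double matrix, double complex vectors.
#define GBMV_STATIC    /*empty*/
#define GBMV_FUNCTION  zgbmv_d           /* Hypothetical name */
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double complex
#define GBMV_KL        const int kl,
#define GBMV_KU        const int ku,
#include "gbmv.def"

// Fixed-bandwidth instantiation: kl and ku become compile-time constants,
// so the optimizer sees constant inner-loop trip counts.
enum { kl = 1, ku = 2 };                 /* Hypothetical bandwidths */
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dgbmv_kl1_ku2     /* Hypothetical name */
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double
#define GBMV_KL        /*empty*/
#define GBMV_KU        /*empty*/
#include "gbmv.def"

Because the template #undef-s its parameters after each use, the two instantiations can coexist in one translation unit, and the __GBMV_INTERNAL_ONCE guard keeps ndxmin and ndxmax from being defined twice.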

The same tricks transfer over to dsbmv as well (though the studious will notice indexing differences relative to the Netlib version):

// Available #include-time parameters, each shown with a sample usage.
// Each of these is #undef-ed at the end of this template file
//
// #define SBMV_STATIC    /*empty*/         /* Use for static function    */
// #define SBMV_FUNCTION  dsbmv             /* Function name              */
// #define SBMV_COMPONENT double            /* Type of matrices           */
// #define SBMV_SCALAR    double            /* Type of coeffs and vectors */
// #define SBMV_K         const int k,      /* Matrix bandwidth           */

#include <ctype.h>   /* toupper */
#include <stdlib.h>  /* abs     */
#include <string.h>  /* memset  */

#ifndef __SBMV_INTERNAL_ONCE
#define __SBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __SBMV_INTERNAL_ONCE */

SBMV_STATIC int SBMV_FUNCTION(
    char uplo, const int n, SBMV_K
    const SBMV_SCALAR alpha, const SBMV_COMPONENT *       restrict a, int lda,
                             const    SBMV_SCALAR * const restrict x, const int incx,
    const SBMV_SCALAR beta,           SBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dsbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    uplo = toupper(uplo);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (uplo != 'U' && uplo != 'L') {
        return 1;
    } else if (n < 0) {
        return 2;
    } else if (k < 0) {
        return 3;
    } else if (lda <= k) {
        return 6;
    } else if (incx == 0) {
        return 8;
    } else if (incy == 0) {
        return 11;
    }

    // Quick return if possible
    if (n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, n * sizeof(y[0]));
            } else {
                for (int i = 0; i < n; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - n);
    int ky = incy > 0 ? 0 : incy*(1 - n);

    // Perform the banded matrix-vector accumulation

    if (uplo == 'U') {  // Form y := alpha*A*x + y for A upper storage...

        // Banded matrix dereference is always of form a[k + i + j*(lda - 1)].
        // Incorporate the k offset and decrement lda to speed indexing.
        a += k;
        --lda;

        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR       temp2 = 0;
                for (int i = il; i < j; ++i) {
                    y[i]  += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += temp1*a[j] + alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2       = 0;
                int ix = kx;
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < j; ++i) {
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                    ix += incx;
                    iy += incy;
                }
                y[jy] += temp1*a[j] + alpha*temp2;
                jx += incx;
                jy += incy;
                if (j >= k) {
                    kx += incx;
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {            // Form y := alpha*A*x + y for A lower storage...

        // Banded matrix dereference is always of form a[i + j*(lda - 1)] here
        // since lower storage needs no leading offset.  Decrement lda to speed
        // indexing.
        --lda;

        const int kp1 = k + 1;
        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR       temp2 = 0;
                y[j] += temp1*a[j];
                for (int i = j + 1; i < iu; ++i) {
                    y[i]  += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2       = 0;
                y[jy] += temp1*a[j];
                int ix = jx;
                int iy = jy;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = j + 1; i < iu; ++i) {
                    ix += incx;
                    iy += incy;
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                }
                y[jy] += alpha*temp2;
                jx += incx;
                jy += incy;
                a += lda;
            }
        }

    }

    return 0;
}

#undef SBMV_STATIC
#undef SBMV_FUNCTION
#undef SBMV_COMPONENT
#undef SBMV_SCALAR
#undef SBMV_K
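
And here is a minimal sketch of a caller, mostly to show the band storage convention the routines expect. It assumes the sbmv template was instantiated as dsbmv with double for both SBMV_COMPONENT and SBMV_SCALAR (analogous to the gbmv examples above); nothing about this driver comes from a real test suite.

#include <stdio.h>

int main(void)
{
    // Symmetric 4x4 tridiagonal matrix (k = 1) in upper band storage:
    // column j holds A(i,j) for i = j-k, ..., j at offset k + i - j + j*lda.
    enum { n = 4, k = 1, lda = k + 1 };
    double a[lda*n] = {0};                   /* a[0] is unused band padding */
    for (int j = 0; j < n; ++j) {
        if (j > 0) a[k - 1 + j*lda] = -1.0;  /* superdiagonal A(j-1,j) */
        a[k + j*lda] = 2.0;                  /* diagonal      A(j,j)   */
    }
    const double x[n] = {1, 1, 1, 1};
    double y[n];
    dsbmv('U', n, k, 1.0, a, lda, x, 1, 0.0, y, 1);
    for (int i = 0; i < n; ++i) {
        printf("y[%d] = %g\n", i, y[i]);     /* Expect 1, 0, 0, 1 */
    }
    return 0;
}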

Hopefully these will be useful to someone else.

Update 11 January 2012: Of course, now that I've found the reference XBLAS, pretty much everything here could (should?) have been done within XBLAS' macro framework. XBLAS definitely has a cleaner mixed-precision approach.
