Specializations of general banded matrix-vector multiplication
I keep needing mixed real/complex level 2 BLAS operations for real-valued, general banded matrices. It's simple to modify Netlib's dgbmv for mixed real/complex data but the result always feels unsatisfactory. Partially because I'd like to avoid a Fortran dependency in my project. Partially because I'd like to build kernels for the special case where I know my matrix's upper and lower bandwidth (as optimizers can rip there according to fellow student Myoungkyu Lee).
I finally got off (err... on?) my duff and reworked dgbmv. The result is a snippet which can be #included from a C99 source file where the proper defines turn it into a real, complex, or mixed real/complex routine. I've uglified it sufficiently that both GNU's and Intel's compilers inform me at -O3 that they've vectorized the loops handling both the A and A**T cases for contiguous vectors.
// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define GBMV_STATIC /*empty*/ /* Use for static function */
// #define GBMV_FUNCTION dgbmv /* Function name */
// #define GBMV_COMPONENT double /* Type of matrices */
// #define GBMV_SCALAR double /* Type of coeffs and vectors */
// #define GBMV_KL const int kl, /* Matrix lower bandwidth */
// #define GBMV_KU const int ku, /* Matrix upper bandwidth */

#ifndef __GBMV_INTERNAL_ONCE
#define __GBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __GBMV_INTERNAL_ONCE */

GBMV_STATIC int GBMV_FUNCTION(
        char trans, const int m, const int n, GBMV_KL GBMV_KU
        const GBMV_SCALAR alpha, const GBMV_COMPONENT * restrict a, int lda,
        const GBMV_SCALAR * const restrict x, const int incx,
        const GBMV_SCALAR beta, GBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dgbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    trans = toupper(trans);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (trans != 'N' && trans != 'T' && trans != 'C') {
        return 1;
    } else if (m < 0) {
        return 2;
    } else if (n < 0) {
        return 3;
    } else if (kl < 0) {
        return 4;
    } else if (ku < 0) {
        return 5;
    } else if (lda <= kl + ku) {
        return 8;
    } else if (incx == 0) {
        return 10;
    } else if (incy == 0) {
        return 13;
    }

    // Quick return if possible
    if (m == 0 || n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Set the lengths of the vectors x and y.
    const int lenx = trans == 'N' ? n : m;
    const int leny = trans == 'N' ? m : n;

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, leny*sizeof(y[0]));
            } else {
                for (int i = 0; i < leny; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - lenx);
    int ky = incy > 0 ? 0 : incy*(1 - leny);

    // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
    // Incorporate the ku offset and decrement lda to speed indexing in loops.
    a += ku;
    --lda;

    // Perform the banded matrix-vector accumulation
    const int klp1 = kl + 1;
    if (trans == 'N') {   // Form y := alpha*A*x + y...

        int jx = kx;
        if (incy == 1) {  // ...for contiguous y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                for (int i = il; i < iu; ++i) {
                    y[i] += temp*a[i];
                }
                jx += incx;
                a += lda;
            }
        } else {          // ...for strided y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    y[iy] += temp*a[i];
                    iy += incy;
                }
                jx += incx;
                if (j >= ku) {
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {              // Form y := alpha*A**T*x + y...

        int jy = ky;
        if (incx == 1) {  // ...for contiguous x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[i];
                }
                y[jy] += alpha * temp;
                jy += incy;
                a += lda;
            }
        } else {          // ...for strided x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                int ix = kx;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[ix];
                    ix += incx;
                }
                y[jy] += alpha*temp;
                jy += incy;
                if (j >= ku) {
                    kx += incx;
                }
                a += lda;
            }
        }

    }

    return 0;
}

#undef GBMV_STATIC
#undef GBMV_FUNCTION
#undef GBMV_COMPONENT
#undef GBMV_SCALAR
#undef GBMV_KL
#undef GBMV_KU
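As a usage sketch (not part of the template itself), here's how I'd expect the snippet to be instantiated from a C99 source file. The file name gbmv.def and the routine names dzgbmv and dgbmv_kl1ku2 are placeholders of my choosing, and the standard headers are needed because the template uses toupper, abs, and memset; the complex variant also needs <complex.h>. The last instantiation is one plausible way to bake in known bandwidths: define GBMV_KL and GBMV_KU to nothing and let file-scope enumeration constants supply kl and ku so they are visible at compile time.

#include <ctype.h>    /* toupper */
#include <stdlib.h>   /* abs */
#include <string.h>   /* memset */
#include <complex.h>  /* double complex */

// Real double-precision routine, mirroring the sample usage in the comments
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dgbmv
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double
#define GBMV_KL        const int kl,
#define GBMV_KU        const int ku,
#include "gbmv.def"    // Hypothetical file name for the template above

// Mixed real/complex routine: real banded matrix, complex coeffs and vectors
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dzgbmv
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double complex
#define GBMV_KL        const int kl,
#define GBMV_KU        const int ku,
#include "gbmv.def"

// Fixed-bandwidth specialization: kl and ku become compile-time constants
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dgbmv_kl1ku2
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double
#define GBMV_KL        /*empty*/
#define GBMV_KU        /*empty*/
enum { kl = 1, ku = 2 };
#include "gbmv.def"

With kl and ku visible as constants, the compiler can size and unroll the short inner loops, which is the payoff I was after for the known-bandwidth case. Since the template #undefs its parameters and guards its ndxmin/ndxmax helpers, several instantiations can live in one translation unit.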
The same tricks transfer over to dsbmv as well (though the studious will notice indexing differences relative to the Netlib version):
// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define SBMV_STATIC /*empty*/ /* Use for static function */
// #define SBMV_FUNCTION dsbmv /* Function name */
// #define SBMV_COMPONENT double /* Type of matrices */
// #define SBMV_SCALAR double /* Type of coeffs and vectors */
// #define SBMV_K const int k, /* Matrix bandwidth */

#ifndef __SBMV_INTERNAL_ONCE
#define __SBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __SBMV_INTERNAL_ONCE */

SBMV_STATIC int SBMV_FUNCTION(
        char uplo, const int n, SBMV_K
        const SBMV_SCALAR alpha, const SBMV_COMPONENT * restrict a, int lda,
        const SBMV_SCALAR * const restrict x, const int incx,
        const SBMV_SCALAR beta, SBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dsbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    uplo = toupper(uplo);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (uplo != 'U' && uplo != 'L') {
        return 1;
    } else if (n < 0) {
        return 2;
    } else if (k < 0) {
        return 3;
    } else if (lda <= k) {
        return 6;
    } else if (incx == 0) {
        return 8;
    } else if (incy == 0) {
        return 11;
    }

    // Quick return if possible
    if (n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, n * sizeof(y[0]));
            } else {
                for (int i = 0; i < n; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - n);
    int ky = incy > 0 ? 0 : incy*(1 - n);

    // Perform the banded matrix-vector accumulation
    if (uplo == 'U') {  // Form y := alpha*A*x + y for A upper storage...

        // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
        // Incorporate the ku offset and decrement lda to speed indexing.
        a += k;
        --lda;

        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR temp2 = 0;
                for (int i = il; i < j; ++i) {
                    y[i] += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += temp1*a[j] + alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2 = 0;
                int ix = kx;
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < j; ++i) {
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                    ix += incx;
                    iy += incy;
                }
                y[jy] += temp1*a[j] + alpha*temp2;
                jx += incx;
                jy += incy;
                if (j >= k) {
                    kx += incx;
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {            // Form y := alpha*A*x + y for A lower storage...

        // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)]
        // where here ku == 0. Decrement lda to speed indexing.
        --lda;

        const int kp1 = k + 1;
        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR temp2 = 0;
                y[j] += temp1*a[j];
                for (int i = j + 1; i < iu; ++i) {
                    y[i] += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2 = 0;
                y[jy] += temp1*a[j];
                int ix = jx;
                int iy = jy;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = j + 1; i < iu; ++i) {
                    ix += incx;
                    iy += incy;
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                }
                y[jy] += alpha*temp2;
                jx += incx;
                jy += incy;
                a += lda;
            }
        }

    }

    return 0;
}

#undef SBMV_STATIC
#undef SBMV_FUNCTION
#undef SBMV_COMPONENT
#undef SBMV_SCALAR
#undef SBMV_K
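To make the calling convention concrete, here is a minimal, hypothetical driver for the symmetric template. It assumes the snippet above is saved as sbmv.def (again, a file name of my own choosing), instantiates a plain double routine per the sample usage comments, and applies it to a 4x4 tridiagonal matrix held in upper band storage.

#include <ctype.h>    /* toupper */
#include <stdio.h>    /* printf */
#include <stdlib.h>   /* abs */
#include <string.h>   /* memset */

#define SBMV_STATIC    static
#define SBMV_FUNCTION  dsbmv
#define SBMV_COMPONENT double
#define SBMV_SCALAR    double
#define SBMV_K         const int k,
#include "sbmv.def"    // Hypothetical file name for the template above

int main(void)
{
    // 4x4 symmetric tridiagonal matrix (2 on the diagonal, -1 beside it) in
    // upper band storage with bandwidth k = 1 and lda = 2: column j stores
    // A(j-1,j) then A(j,j), so band[0] is unused padding.
    const double band[] = { 0, 2, -1, 2, -1, 2, -1, 2 };
    const double x[]    = { 1, 1, 1, 1 };
    double       y[4];

    // y := 1*A*x + 0*y, which should produce (1, 0, 0, 1)
    dsbmv('U', 4, 1, 1.0, band, 2, x, 1, 0.0, y, 1);

    for (int i = 0; i < 4; ++i) printf("y[%d] = %g\n", i, y[i]);
    return 0;
}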
Hopefully these will be useful to someone else.
Update 11 January 2012: Of course, now that I've found the reference XBLAS, pretty much everything here could (should?) have been done within XBLAS' macro framework. XBLAS is definitely a cleaner mixed-precision approach.