Specializations of general banded matrix-vector multiplication
I keep needing mixed real/complex level 2 BLAS operations for real-valued, general banded matrices. It's simple to modify Netlib's dgbmv for mixed real/complex data, but the result always feels unsatisfactory: partly because I'd like to avoid a Fortran dependency in my project, and partly because I'd like to build kernels for the special case where I know my matrix's upper and lower bandwidths at compile time (optimizers can rip through those loops, according to fellow student Myoungkyu Lee).
I finally got off (err... on?) my duff and reworked dgbmv. The result is a snippet that can be #included from a C99 source file, where the proper defines turn it into a real, complex, or mixed real/complex routine. I've uglified it sufficiently that both the GNU and Intel compilers inform me at -O3 that they've vectorized the loops handling both the A and A**T cases for contiguous vectors.
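For anyone wanting to call the result: both routines use the usual BLAS band storage shifted to zero-based indexing, so element (i, j) of the full matrix sits at ab[ku + i - j + j*lda] with lda >= kl + ku + 1. A rough packing helper, with a made-up name and signature just to illustrate the layout, might look like:
// Illustrative only: pack a column-major, m-by-n matrix into the
// zero-indexed band storage expected by the routines below.
static void pack_band(int m, int n, int kl, int ku,
                      const double *full, int ldfull, // full matrix, column-major
                      double *ab, int lda)            // band storage, lda >= kl + ku + 1
{
    for (int j = 0; j < n; ++j) {
        const int ilo = j - ku > 0     ? j - ku : 0;
        const int ihi = j + kl < m - 1 ? j + kl : m - 1;
        for (int i = ilo; i <= ihi; ++i) {
            ab[ku + i - j + j*lda] = full[i + j*ldfull];
        }
    }
}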
// #include-time parameters available followed by a sample usage.
// Each of these is #undef-ed at the end of this template file
//
// #define GBMV_STATIC /*empty*/ /* Use for static function */
// #define GBMV_FUNCTION dgbmv /* Function name */
// #define GBMV_COMPONENT double /* Type of matrices */
// #define GBMV_SCALAR double /* Type of coeffs and vectors */
// #define GBMV_KL const int kl, /* Matrix lower bandwidth */
// #define GBMV_KU const int ku, /* Matrix upper bandwidth */
#ifndef __GBMV_INTERNAL_ONCE
#define __GBMV_INTERNAL_ONCE
#include <ctype.h>   // toupper
#include <stdlib.h>  // abs
#include <string.h>  // memset
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __GBMV_INTERNAL_ONCE */
GBMV_STATIC int GBMV_FUNCTION(
char trans, const int m, const int n, GBMV_KL GBMV_KU
const GBMV_SCALAR alpha, const GBMV_COMPONENT * restrict a, int lda,
const GBMV_SCALAR * const restrict x, const int incx,
const GBMV_SCALAR beta, GBMV_SCALAR * const restrict y, const int incy)
{
// Logic from http://www.netlib.org/blas/dgbmv.f revised
// for zero-indexed, vectorization-ready code on GCC and Intel.
trans = toupper(trans); // Simplifies case-insensitive checks
// Test the input parameters
if (trans != 'N' && trans != 'T' && trans != 'C') {
return 1;
} else if (m < 0) {
return 2;
} else if (n < 0) {
return 3;
} else if (kl < 0) {
return 4;
} else if (ku < 0) {
return 5;
} else if (lda <= kl + ku) {
return 8;
} else if (incx == 0) {
return 10;
} else if (incy == 0) {
return 13;
}
// Quick return if possible
if (m == 0 || n == 0 || (alpha == 0 && beta == 1)) {
return 0;
}
// Set the lengths of the vectors x and y.
const int lenx = trans == 'N' ? n : m;
const int leny = trans == 'N' ? m : n;
// Start the operations. In this version the elements of A are accessed
// sequentially with one pass through the band part of A.
// First form y := beta*y (ignoring the irrelevant incy sign)...
if (beta != 1) {
const int abs_incy = abs(incy);
if (abs_incy == 1) { // ...for contiguous y
if (beta == 0) {
memset(y, 0, leny*sizeof(y[0]));
} else {
for (int i = 0; i < leny; ++i) {
y[i] *= beta;
}
}
} else { // ...for strided y
if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < leny; ++i) {
y[i*abs_incy] = 0;
}
} else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < leny; ++i) {
y[i*abs_incy] *= beta;
}
}
}
}
// Quick return when the matrix used is irrelevant
if (alpha == 0) {
return 0;
}
// Set up the start points in x and y.
int kx = incx > 0 ? 0 : incx*(1 - lenx);
int ky = incy > 0 ? 0 : incy*(1 - leny);
// Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
// Incorporate the ku offset and decrement lda to speed indexing in loops.
a += ku;
--lda;
// Perform the banded matrix-vector accumulation
const int klp1 = kl + 1;
if (trans == 'N') { // Form y := alpha*A*x + y...
int jx = kx;
if (incy == 1) { // ...for contiguous y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
const GBMV_SCALAR temp = alpha*x[jx];
for (int i = il; i < iu; ++i) {
y[i] += temp*a[i];
}
jx += incx;
a += lda;
}
} else { // ...for strided y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
const GBMV_SCALAR temp = alpha*x[jx];
int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < iu; ++i) {
y[iy] += temp*a[i];
iy += incy;
}
jx += incx;
if (j >= ku) {
ky += incy;
}
a += lda;
}
}
} else { // Form y := alpha*A**T*x + y...
int jy = ky;
if (incx == 1) { // ...for contiguous x
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
GBMV_SCALAR temp = 0;
for (int i = il; i < iu; ++i) {
temp += a[i]*x[i];
}
y[jy] += alpha * temp;
jy += incy;
a += lda;
}
} else { // ...for strided x
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
GBMV_SCALAR temp = 0;
int ix = kx;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < iu; ++i) {
temp += a[i]*x[ix];
ix += incx;
}
y[jy] += alpha*temp;
jy += incy;
if (j >= ku) {
kx += incx;
}
a += lda;
}
}
}
return 0;
}
#undef GBMV_STATIC
#undef GBMV_FUNCTION
#undef GBMV_COMPONENT
#undef GBMV_SCALAR
#undef GBMV_KL
#undef GBMV_KU
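To show what I mean by "the proper defines", here is a hypothetical pair of instantiations, assuming the template above has been saved as gbmv.def (the file and function names are only illustrative): one mixed real/complex routine with runtime bandwidths, and one with the bandwidths baked in so the optimizer sees fixed trip counts.
#include <complex.h>

// Real banded matrix, complex coefficients and vectors, runtime bandwidths.
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dzgbmv          /* Hypothetical name */
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double _Complex
#define GBMV_KL        const int kl,
#define GBMV_KU        const int ku,
#include "gbmv.def"                    /* Hypothetical file name for the template */

// Same types, but with the bandwidths fixed at compile time.
enum { kl = 2, ku = 2 };
#define GBMV_STATIC    static
#define GBMV_FUNCTION  dzgbmv_kl2_ku2  /* Hypothetical name */
#define GBMV_COMPONENT double
#define GBMV_SCALAR    double _Complex
#define GBMV_KL        /*empty*/
#define GBMV_KU        /*empty*/
#include "gbmv.def"
The first is called just like dgbmv, e.g. dzgbmv('N', m, n, kl, ku, alpha, a, lda, x, 1, beta, y, 1); the second simply drops the kl and ku arguments.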
The same tricks transfer over to dsbmv as well (though the studious will notice indexing differences relative to the Netlib version):
// #include-time parameters available followed by a sample usage.
// Each of these is #undef-ed at the end of this template file
//
// #define SBMV_STATIC /*empty*/ /* Use for static function */
// #define SBMV_FUNCTION dsbmv /* Function name */
// #define SBMV_COMPONENT double /* Type of matrices */
// #define SBMV_SCALAR double /* Type of coeffs and vectors */
// #define SBMV_K const int k, /* Matrix bandwidth */
#ifndef __SBMV_INTERNAL_ONCE
#define __SBMV_INTERNAL_ONCE
#include <ctype.h>   // toupper
#include <stdlib.h>  // abs
#include <string.h>  // memset
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __SBMV_INTERNAL_ONCE */
SBMV_STATIC int SBMV_FUNCTION(
char uplo, const int n, SBMV_K
const SBMV_SCALAR alpha, const SBMV_COMPONENT * restrict a, int lda,
const SBMV_SCALAR * const restrict x, const int incx,
const SBMV_SCALAR beta, SBMV_SCALAR * const restrict y, const int incy)
{
// Logic from http://www.netlib.org/blas/dsbmv.f revised
// for zero-indexed, vectorization-ready code on GCC and Intel.
uplo = toupper(uplo); // Simplifies case-insensitive checks
// Test the input parameters
if (uplo != 'U' && uplo != 'L') {
return 1;
} else if (n < 0) {
return 2;
} else if (k < 0) {
return 3;
} else if (lda <= k) {
return 6;
} else if (incx == 0) {
return 8;
} else if (incy == 0) {
return 11;
}
// Quick return if possible
if (n == 0 || (alpha == 0 && beta == 1)) {
return 0;
}
// Start the operations. In this version the elements of A are accessed
// sequentially with one pass through the band part of A.
// First form y := beta*y (ignoring the irrelevant incy sign)...
if (beta != 1) {
const int abs_incy = abs(incy);
if (abs_incy == 1) { // ...for contiguous y
if (beta == 0) {
memset(y, 0, n * sizeof(y[0]));
} else {
for (int i = 0; i < n; ++i) {
y[i] *= beta;
}
}
} else { // ...for strided y
if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < n; ++i) {
y[i*abs_incy] = 0;
}
} else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < n; ++i) {
y[i*abs_incy] *= beta;
}
}
}
}
// Quick return when the matrix used is irrelevant
if (alpha == 0) {
return 0;
}
// Set up the start points in x and y.
int kx = incx > 0 ? 0 : incx*(1 - n);
int ky = incy > 0 ? 0 : incy*(1 - n);
// Perform the banded matrix-vector accumulation
if (uplo == 'U') { // Form y := alpha*A*x + y for A upper storage...
// Banded matrix dereference is of form a[k + i + j*(lda - 1)] here.
// Incorporate the k offset and decrement lda to speed indexing.
a += k;
--lda;
if (incx == 1 && incy == 1) { // ...for contiguous x and y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - k);
const SBMV_SCALAR temp1 = alpha*x[j];
SBMV_SCALAR temp2 = 0;
for (int i = il; i < j; ++i) {
y[i] += temp1*a[i];
temp2 += a[i]*x[i];
}
y[j] += temp1*a[j] + alpha*temp2;
a += lda;
}
} else { // ...for strided x and/or strided y
int jx = kx;
int jy = ky;
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - k);
const SBMV_SCALAR temp1 = alpha*x[jx];
SBMV_SCALAR temp2 = 0;
int ix = kx;
int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < j; ++i) {
y[iy] += temp1*a[i];
temp2 += a[i]*x[ix];
ix += incx;
iy += incy;
}
y[jy] += temp1*a[j] + alpha*temp2;
jx += incx;
jy += incy;
if (j >= k) {
kx += incx;
ky += incy;
}
a += lda;
}
}
} else { // Form y := alpha*A*x + y for A lower storage...
// Banded matrix dereference is always of form a[ku + i + j*(lda - 1)]
// where here ku == 0. Decrement lda to speed indexing.
--lda;
const int kp1 = k + 1;
if (incx == 1 && incy == 1) { // ...for contiguous x and y
for (int j = 0; j < n; ++j) {
const int iu = ndxmin(n, j + kp1);
const SBMV_SCALAR temp1 = alpha*x[j];
SBMV_SCALAR temp2 = 0;
y[j] += temp1*a[j];
for (int i = j + 1; i < iu; ++i) {
y[i] += temp1*a[i];
temp2 += a[i]*x[i];
}
y[j] += alpha*temp2;
a += lda;
}
} else { // ...for strided x and/or strided y
int jx = kx;
int jy = ky;
for (int j = 0; j < n; ++j) {
const int iu = ndxmin(n, j + kp1);
const SBMV_SCALAR temp1 = alpha*x[jx];
SBMV_SCALAR temp2 = 0;
y[jy] += temp1*a[j];
int ix = jx;
int iy = jy;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = j + 1; i < iu; ++i) {
ix += incx;
iy += incy;
y[iy] += temp1*a[i];
temp2 += a[i]*x[ix];
}
y[jy] += alpha*temp2;
jx += incx;
jy += incy;
a += lda;
}
}
}
return 0;
}
#undef SBMV_STATIC
#undef SBMV_FUNCTION
#undef SBMV_COMPONENT
#undef SBMV_SCALAR
#undef SBMV_K
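And a matching hypothetical instantiation for the symmetric template, again with illustrative names, this time with the bandwidth fixed at compile time. (As written, the gbmv and sbmv templates each define their own static ndxmin/ndxmax helpers, so I'd keep them in separate translation units or merge those guards before combining them.)
#include <complex.h>

// Real symmetric banded matrix, complex coefficients and vectors,
// with the bandwidth fixed at compile time.
enum { k = 1 };
#define SBMV_STATIC    static
#define SBMV_FUNCTION  dzsbmv_k1       /* Hypothetical name */
#define SBMV_COMPONENT double
#define SBMV_SCALAR    double _Complex
#define SBMV_K         /*empty*/
#include "sbmv.def"                    /* Hypothetical file name for the template */
The result is called like dsbmv minus the bandwidth argument, e.g. dzsbmv_k1('U', n, alpha, a, lda, x, 1, beta, y, 1).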
Hopefully these will be useful to someone else.
Update 11 January 2012: Of course, now that I've found the reference XBLAS, pretty much everything here could (should?) have been done within XBLAS' macro framework. XBLAS is definitely the cleaner mixed-precision approach.