michellethompson.org
Two years after our friend Nancy snagged the domain, we finally put up some content at http://michellethompson.org/...
Two years after our friend Nancy snagged the domain, we finally put up some content at http://michellethompson.org/...
I keep needing mixed real/complex level 2 BLAS operations for real-valued, general banded matrices. It's simple to modify Netlib's dgbmv for mixed real/complex data but the result always feels unsatisfactory. Partially because I'd like to avoid a Fortran dependency in my project. Partially because I'd like to build kernels for the special case where I know my matrix's upper and lower bandwidth (as optimizers can rip there according to fellow student Myoungkyu Lee).
I finally got off (err... on?) my duff and reworked dgbmv. The result is a snippet which can be #included from a C99 source file where the proper defines turn it into a real, complex, or mixed real/complex routine. I've uglified it sufficiently that both GNU and Intel's compiler's inform me at -O3 that they've vectorized the loops handling both A and A**T cases for contiguous vectors.
// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define GBMV_STATIC /*empty*/ /* Use for static function */
// #define GBMV_FUNCTION dgbmv /* Function name */
// #define GBMV_COMPONENT double /* Type of matrices */
// #define GBMV_SCALAR double /* Type of coeffs and vectors */
// #define GBMV_KL const int kl, /* Matrix lower bandwidth */
// #define GBMV_KU const int ku, /* Matrix upper bandwidth */
#ifndef __GBMV_INTERNAL_ONCE
#define __GBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __GBMV_INTERNAL_ONCE */
GBMV_STATIC int GBMV_FUNCTION(
char trans, const int m, const int n, GBMV_KL GBMV_KU
const GBMV_SCALAR alpha, const GBMV_COMPONENT * restrict a, int lda,
const GBMV_SCALAR * const restrict x, const int incx,
const GBMV_SCALAR beta, GBMV_SCALAR * const restrict y, const int incy)
{
// Logic from http://www.netlib.org/blas/dgbmv.f revised
// for zero-indexed, vectorization-ready code on GCC and Intel.
trans = toupper(trans); // Simplifies case-insensitive checks
// Test the input parameters
if (trans != 'N' && trans != 'T' && trans != 'C') {
return 1;
} else if (m < 0) {
return 2;
} else if (n < 0) {
return 3;
} else if (kl < 0) {
return 4;
} else if (ku < 0) {
return 5;
} else if (lda <= kl + ku) {
return 8;
} else if (incx == 0) {
return 10;
} else if (incy == 0) {
return 13;
}
// Quick return if possible
if (m == 0 || n == 0 || (alpha == 0 && beta == 1)) {
return 0;
}
// Set the lengths of the vectors x and y.
const int lenx = trans == 'N' ? n : m;
const int leny = trans == 'N' ? m : n;
// Start the operations. In this version the elements of A are accessed
// sequentially with one pass through the band part of A.
// First form y := beta*y (ignoring the irrelevant incy sign)...
if (beta != 1) {
const int abs_incy = abs(incy);
if (abs_incy == 1) { // ...for contiguous y
if (beta == 0) {
memset(y, 0, leny*sizeof(y[0]));
} else {
for (int i = 0; i < leny; ++i) {
y[i] *= beta;
}
}
} else { // ...for strided y
if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < leny; ++i) {
y[i*abs_incy] = 0;
}
} else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < leny; ++i) {
y[i*abs_incy] *= beta;
}
}
}
}
// Quick return when the matrix used is irrelevant
if (alpha == 0) {
return 0;
}
// Set up the start points in x and y.
int kx = incx > 0 ? 0 : incx*(1 - lenx);
int ky = incy > 0 ? 0 : incy*(1 - leny);
// Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
// Incorporate the ku offset and decrement lda to speed indexing in loops.
a += ku;
--lda;
// Perform the banded matrix-vector accumulation
const int klp1 = kl + 1;
if (trans == 'N') { // Form y := alpha*A*x + y...
int jx = kx;
if (incy == 1) { // ...for contiguous y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
const GBMV_SCALAR temp = alpha*x[jx];
for (int i = il; i < iu; ++i) {
y[i] += temp*a[i];
}
jx += incx;
a += lda;
}
} else { // ...for strided y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
const GBMV_SCALAR temp = alpha*x[jx];
int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < iu; ++i) {
y[iy] += temp*a[i];
iy += incy;
}
jx += incx;
if (j >= ku) {
ky += incy;
}
a += lda;
}
}
} else { // Form y := alpha*A**T*x + y...
int jy = ky;
if (incx == 1) { // ...for contiguous x
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
GBMV_SCALAR temp = 0;
for (int i = il; i < iu; ++i) {
temp += a[i]*x[i];
}
y[jy] += alpha * temp;
jy += incy;
a += lda;
}
} else { // ...for strided x
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - ku);
const int iu = ndxmin(m, j + klp1);
GBMV_SCALAR temp = 0;
int ix = kx;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < iu; ++i) {
temp += a[i]*x[ix];
ix += incx;
}
y[jy] += alpha*temp;
jy += incy;
if (j >= ku) {
kx += incx;
}
a += lda;
}
}
}
return 0;
}
#undef GBMV_STATIC
#undef GBMV_FUNCTION
#undef GBMV_COMPONENT
#undef GBMV_SCALAR
#undef GBMV_KL
#undef GBMV_KU
The same tricks transfer over to dsbmv as well (though the studious will notice indexing differences relative to the Netlib version):
// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define SBMV_STATIC /*empty*/ /* Use for static function */
// #define SBMV_FUNCTION dsbmv /* Function name */
// #define SBMV_COMPONENT double /* Type of matrices */
// #define SBMV_SCALAR double /* Type of coeffs and vectors */
// #define SBMV_K const int k, /* Matrix bandwidth */
#ifndef __SBMV_INTERNAL_ONCE
#define __SBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __SBMV_INTERNAL_ONCE */
SBMV_STATIC int SBMV_FUNCTION(
char uplo, const int n, SBMV_K
const SBMV_SCALAR alpha, const SBMV_COMPONENT * restrict a, int lda,
const SBMV_SCALAR * const restrict x, const int incx,
const SBMV_SCALAR beta, SBMV_SCALAR * const restrict y, const int incy)
{
// Logic from http://www.netlib.org/blas/dsbmv.f revised
// for zero-indexed, vectorization-ready code on GCC and Intel.
uplo = toupper(uplo); // Simplifies case-insensitive checks
// Test the input parameters
if (uplo != 'U' && uplo != 'L') {
return 1;
} else if (n < 0) {
return 2;
} else if (k < 0) {
return 3;
} else if (lda <= k) {
return 6;
} else if (incx == 0) {
return 8;
} else if (incy == 0) {
return 11;
}
// Quick return if possible
if (n == 0 || (alpha == 0 && beta == 1)) {
return 0;
}
// Start the operations. In this version the elements of A are accessed
// sequentially with one pass through the band part of A.
// First form y := beta*y (ignoring the irrelevant incy sign)...
if (beta != 1) {
const int abs_incy = abs(incy);
if (abs_incy == 1) { // ...for contiguous y
if (beta == 0) {
memset(y, 0, n * sizeof(y[0]));
} else {
for (int i = 0; i < n; ++i) {
y[i] *= beta;
}
}
} else { // ...for strided y
if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < n; ++i) {
y[i*abs_incy] = 0;
}
} else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = 0; i < n; ++i) {
y[i*abs_incy] *= beta;
}
}
}
}
// Quick return when the matrix used is irrelevant
if (alpha == 0) {
return 0;
}
// Set up the start points in x and y.
int kx = incx > 0 ? 0 : incx*(1 - n);
int ky = incy > 0 ? 0 : incy*(1 - n);
// Perform the banded matrix-vector accumulation
if (uplo == 'U') { // Form y := alpha*A*x + y for A upper storage...
// Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
// Incorporate the ku offset and decrement lda to speed indexing.
a += k;
--lda;
if (incx == 1 && incy == 1) { // ...for contiguous x and y
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - k);
const SBMV_SCALAR temp1 = alpha*x[j];
SBMV_SCALAR temp2 = 0;
for (int i = il; i < j; ++i) {
y[i] += temp1*a[i];
temp2 += a[i]*x[i];
}
y[j] += temp1*a[j] + alpha*temp2;
a += lda;
}
} else { // ...for strided x and/or strided y
int jx = kx;
int jy = ky;
for (int j = 0; j < n; ++j) {
const int il = ndxmax(0, j - k);
const SBMV_SCALAR temp1 = alpha*x[jx];
SBMV_SCALAR temp2 = 0;
int ix = kx;
int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = il; i < j; ++i) {
y[iy] += temp1*a[i];
temp2 += a[i]*x[ix];
ix += incx;
iy += incy;
}
y[jy] += temp1*a[j] + alpha*temp2;
jx += incx;
jy += incy;
if (j >= k) {
kx += incx;
ky += incy;
}
a += lda;
}
}
} else { // Form y := alpha*A*x + y for A lower storage...
// Banded matrix dereference is always of form a[ku + i + j*(lda - 1)]
// where here ku == 0. Decrement lda to speed indexing.
--lda;
const int kp1 = k + 1;
if (incx == 1 && incy == 1) { // ...for contiguous x and y
for (int j = 0; j < n; ++j) {
const int iu = ndxmin(n, j + kp1);
const SBMV_SCALAR temp1 = alpha*x[j];
SBMV_SCALAR temp2 = 0;
y[j] += temp1*a[j];
for (int i = j + 1; i < iu; ++i) {
y[i] += temp1*a[i];
temp2 += a[i]*x[i];
}
y[j] += alpha*temp2;
a += lda;
}
} else { // ...for strided x and/or strided y
int jx = kx;
int jy = ky;
for (int j = 0; j < n; ++j) {
const int iu = ndxmin(n, j + kp1);
const SBMV_SCALAR temp1 = alpha*x[jx];
SBMV_SCALAR temp2 = 0;
y[jy] += temp1*a[j];
int ix = jx;
int iy = jy;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
for (int i = j + 1; i < iu; ++i) {
ix += incx;
iy += incy;
y[iy] += temp1*a[i];
temp2 += a[i]*x[ix];
}
y[jy] += alpha*temp2;
jx += incx;
jy += incy;
a += lda;
}
}
}
return 0;
}
#undef SBMV_STATIC
#undef SBMV_FUNCTION
#undef SBMV_COMPONENT
#undef SBMV_SCALAR
#undef SBMV_K
Hopefully these will be useful to someone else.
Update 11 January 2012:Of course, now that I've found the reference XBLAS, pretty much everything here could (should?) have been done within XBLAS' macro framework. XBLAS definitely a cleaner mixed precision approach.
Following up on previous failures, I've chosen to test a portion of my compressible channel code Suzerain across a small parameter sweep. This problem is ill-suited for a traditional HPC resource as many of the jobs will fail and none of them have to run for very long.
The problem is perfect for GNU parallel:
#!/bin/bash
# Build the fixed parameters for channel initialization
fixed="--Ma=1.5 --Re=3000 --Pr=0.7 --gamma=1.4 --beta=0.7 --k=8"
# Build the parameter sweep affix for GNU parallel, placeholders, and a case description
sweep="" ; case="case{#}" ; args="$case"
sweep="$sweep ::: 96 120" ; Nx="--Nx={1}" ; args="$args $Nx"
sweep="$sweep ::: 72 96" ; Ny="--Ny={2}" ; args="$args $Ny"
sweep="$sweep ::: 2 3" ; htdelta="--htdelta={3}" ; args="$args $htdelta"
sweep="$sweep ::: 60 96" ; Nz="--Nz={4}" ; args="$args $Nz"
sweep="$sweep ::: 125 150" ; fluct_percent="--fluct_percent={5}" ; args="$args $fluct_percent"
sweep="$sweep ::: 12345 67890" ; fluct_seed="--fluct_seed={6}" ; args="$args $fluct_seed"
sweep="$sweep ::: 2 5" ; alpha="--alpha={7}" ; args="$args $alpha"
sweep="$sweep ::: .4 .5" ; npower="--npower={8}" ; args="$args $npower"
sweep="$sweep ::: 0:1 .5:.75" ; fluct_kxfrac="--fluct_kxfrac={9}" ; args="$args $fluct_kxfrac"
sweep="$sweep ::: 0:1 .5:.75" ; fluct_kzfrac="--fluct_kzfrac={10}" ; args="$args $fluct_kzfrac"
# Remove any stale data
rm -rfv case* bad.case*
# Generate a master table of test cases for fun and profit
parallel -k -j 1 echo $args $sweep | column -t > master
# Generate inputs for a single test case
parallel "mkdir $case && cd $case && echo $args > args" \
" && ../bin/channel_init initial.h5 $fixed $Nx $Ny $htdelta $Nz $alpha $npower" \
" && ../bin/channel_explicit initial.h5 --advance_nt=0 $fluct_percent $fluct_seed $fluct_kxfrac $fluct_kzfrac" \
$sweep
# Run each case for 1000 time steps to shake out non-robust scenarios
parallel --eta -j 3 -u \
"cd {} && ../bin/channel_explicit restart0.h5 --advance_nt=1000 --status_nt=5" \
::: case*
# Flag test cases that glaringly failed with a ".bad" suffix
parallel "mv {//} bad.{//}" ::: $(grep -l "TimeController halted" case*/log.dat)
The first two "blocks" define the parameter sweep using parallel's feature where "::: a b ::: 1 2" turns into the outer product "a 1", "a 2", "b 1", "b 2". The next bit builds a table of these outer products so I can refer to it later. The remaining three bits perform some IO-heavy initialization, some compute-intensive tasks, and then a post-processing pass on each of the entries in said outer product. This logic, and the associated runtime process juggling to get nice batch throughput, would be hell without GNU parallel. Thanks Ole.
Suzerain, my pseudospectral, compressible DNS code, is coming along. After some manufactured solution love I'm onto channel problems. With some added solenoidal velocity perturbations my laminar solutions transitioned to turbulent ones quite nicely.
However, I'm currently puzzling over why my Coleman-like case is showing stationary mean state behavior...