The Return of Agent Zlerich: 2011

24 December 2011

Specializations of general banded matrix-vector multiplication

I keep needing mixed real/complex level 2 BLAS operations for real-valued, general banded matrices. It's simple to modify Netlib's dgbmv for mixed real/complex data but the result always feels unsatisfactory. Partially because I'd like to avoid a Fortran dependency in my project. Partially because I'd like to build kernels for the special case where I know my matrix's upper and lower bandwidth (as optimizers can rip there according to fellow student Myoungkyu Lee).

I finally got off (err... on?) my duff and reworked dgbmv. The result is a snippet which can be #included from a C99 source file where the proper defines turn it into a real, complex, or mixed real/complex routine. I've uglified it sufficiently that both GNU and Intel's compiler's inform me at -O3 that they've vectorized the loops handling both A and A**T cases for contiguous vectors.

// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define GBMV_STATIC    /*empty*/         /* Use for static function    */
// #define GBMV_FUNCTION  dgbmv             /* Function name              */
// #define GBMV_COMPONENT double            /* Type of matrices           */
// #define GBMV_SCALAR    double            /* Type of coeffs and vectors */
// #define GBMV_KL        const int kl,     /* Matrix lower bandwidth     */
// #define GBMV_KU        const int ku,     /* Matrix upper bandwidth     */

#ifndef __GBMV_INTERNAL_ONCE
#define __GBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __GBMV_INTERNAL_ONCE */

GBMV_STATIC int GBMV_FUNCTION(
    char trans, const int m, const int n, GBMV_KL GBMV_KU
    const GBMV_SCALAR alpha, const GBMV_COMPONENT *       restrict a, int lda,
                             const    GBMV_SCALAR * const restrict x, const int incx,
    const GBMV_SCALAR beta,           GBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dgbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    trans = toupper(trans);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (trans != 'N' && trans != 'T' && trans != 'C') {
        return 1;
    } else if (m < 0) {
        return 2;
    } else if (n < 0) {
        return 3;
    } else if (kl < 0) {
        return 4;
    } else if (ku < 0) {
        return 5;
    } else if (lda <= kl + ku) {
        return 8;
    } else if (incx == 0) {
        return 10;
    } else if (incy == 0) {
        return 13;
    }

    // Quick return if possible
    if (m == 0 || n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Set the lengths of the vectors x and y.
    const int lenx = trans == 'N' ? n : m;
    const int leny = trans == 'N' ? m : n;

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, leny*sizeof(y[0]));
            } else {
                for (int i = 0; i < leny; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < leny; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - lenx);
    int ky = incy > 0 ? 0 : incy*(1 - leny);

    // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
    // Incorporate the ku offset and decrement lda to speed indexing in loops.
    a += ku;
    --lda;

    // Perform the banded matrix-vector accumulation
    const int klp1 = kl + 1;

    if (trans == 'N') {   // Form y := alpha*A*x + y...

        int jx = kx;
        if (incy == 1) {  // ...for contiguous y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                for (int i = il; i < iu; ++i) {
                    y[i] += temp*a[i];
                }
                jx += incx;
                a += lda;
            }
        } else {          // ...for strided y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                const GBMV_SCALAR temp = alpha*x[jx];
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    y[iy] += temp*a[i];
                    iy += incy;
                }
                jx += incx;
                if (j >= ku) {
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {              // Form y := alpha*A**T*x + y...

        int jy = ky;
        if (incx == 1) {  // ...for contiguous x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[i];
                }
                y[jy] += alpha * temp;
                jy += incy;
                a += lda;
            }
        } else {          // ...for strided x
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - ku);
                const int iu = ndxmin(m, j + klp1);
                GBMV_SCALAR temp = 0;
                int ix = kx;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < iu; ++i) {
                    temp += a[i]*x[ix];
                    ix += incx;
                }
                y[jy] += alpha*temp;
                jy += incy;
                if (j >= ku) {
                    kx += incx;
                }
                a += lda;
            }
        }
    }

    return 0;
}

#undef GBMV_STATIC
#undef GBMV_FUNCTION
#undef GBMV_COMPONENT
#undef GBMV_SCALAR
#undef GBMV_KL
#undef GBMV_KU

The same tricks transfer over to dsbmv as well (though the studious will notice indexing differences relative to the Netlib version):

// #include-time parameters available followed by a sample usage.
// Each of these are #undef-ed at the end of this template file
//
// #define SBMV_STATIC    /*empty*/         /* Use for static function    */
// #define SBMV_FUNCTION  dsbmv             /* Function name              */
// #define SBMV_COMPONENT double            /* Type of matrices           */
// #define SBMV_SCALAR    double            /* Type of coeffs and vectors */
// #define SBMV_K         const int k,      /* Matrix bandwidth           */

#ifndef __SBMV_INTERNAL_ONCE
#define __SBMV_INTERNAL_ONCE
static inline int ndxmin(int a, int b) { return a < b ? a : b; }
static inline int ndxmax(int a, int b) { return a > b ? a : b; }
#endif /* __SBMV_INTERNAL_ONCE */

SBMV_STATIC int SBMV_FUNCTION(
    char uplo, const int n, SBMV_K
    const SBMV_SCALAR alpha, const SBMV_COMPONENT *       restrict a, int lda,
                             const    SBMV_SCALAR * const restrict x, const int incx,
    const SBMV_SCALAR beta,           SBMV_SCALAR * const restrict y, const int incy)
{
    // Logic from http://www.netlib.org/blas/dsbmv.f revised
    // for zero-indexed, vectorization-ready code on GCC and Intel.

    uplo = toupper(uplo);  // Simplifies case-insensitive checks

    // Test the input parameters
    if (uplo != 'U' && uplo != 'L') {
        return 1;
    } else if (n < 0) {
        return 2;
    } else if (k < 0) {
        return 3;
    } else if (lda <= k) {
        return 6;
    } else if (incx == 0) {
        return 8;
    } else if (incy == 0) {
        return 11;
    }

    // Quick return if possible
    if (n == 0 || (alpha == 0 && beta == 1)) {
        return 0;
    }

    // Start the operations. In this version the elements of A are accessed
    // sequentially with one pass through the band part of A.

    // First form y := beta*y (ignoring the irrelevant incy sign)...
    if (beta != 1) {
        const int abs_incy = abs(incy);
        if (abs_incy == 1) {  // ...for contiguous y
            if (beta == 0) {
                memset(y, 0, n * sizeof(y[0]));
            } else {
                for (int i = 0; i < n; ++i) {
                    y[i] *= beta;
                }
            }
        } else {              // ...for strided y
            if (beta == 0) {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] = 0;
                }
            } else {
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = 0; i < n; ++i) {
                    y[i*abs_incy] *= beta;
                }
            }
        }
    }

    // Quick return when the matrix used is irrelevant
    if (alpha == 0) {
        return 0;
    }

    // Set up the start points in x and y.
    int kx = incx > 0 ? 0 : incx*(1 - n);
    int ky = incy > 0 ? 0 : incy*(1 - n);

    // Perform the banded matrix-vector accumulation

    if (uplo == 'U') {  // Form y := alpha*A*x + y for A upper storage...

        // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)].
        // Incorporate the ku offset and decrement lda to speed indexing.
        a += k;
        --lda;

        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR       temp2 = 0;
                for (int i = il; i < j; ++i) {
                    y[i]  += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += temp1*a[j] + alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int il = ndxmax(0, j - k);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2       = 0;
                int ix = kx;
                int iy = ky;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = il; i < j; ++i) {
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                    ix += incx;
                    iy += incy;
                }
                y[jy] += temp1*a[j] + alpha*temp2;
                jx += incx;
                jy += incy;
                if (j >= k) {
                    kx += incx;
                    ky += incy;
                }
                a += lda;
            }
        }

    } else {            // Form y := alpha*A*x + y for A lower storage...

        // Banded matrix dereference is always of form a[ku + i + j*(lda - 1)]
        // where here ku == 0.  Decrement lda to speed indexing.
        --lda;

        const int kp1 = k + 1;
        if (incx == 1 && incy == 1) {  // ...for contiguous x and y
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[j];
                SBMV_SCALAR       temp2 = 0;
                y[j] += temp1*a[j];
                for (int i = j + 1; i < iu; ++i) {
                    y[i]  += temp1*a[i];
                    temp2 += a[i]*x[i];
                }
                y[j] += alpha*temp2;
                a += lda;
            }
        } else {                       // ...for strided x and/or strided y
            int jx = kx;
            int jy = ky;
            for (int j = 0; j < n; ++j) {
                const int iu = ndxmin(n, j + kp1);
                const SBMV_SCALAR temp1 = alpha*x[jx];
                SBMV_SCALAR temp2       = 0;
                y[jy] += temp1*a[j];
                int ix = jx;
                int iy = jy;
#ifdef __INTEL_COMPILER
#pragma unroll
#endif
                for (int i = j + 1; i < iu; ++i) {
                    ix += incx;
                    iy += incy;
                    y[iy] += temp1*a[i];
                    temp2 += a[i]*x[ix];
                }
                y[jy] += alpha*temp2;
                jx += incx;
                jy += incy;
                a += lda;
            }
        }

    }

    return 0;
}

#undef SBMV_STATIC
#undef SBMV_FUNCTION
#undef SBMV_COMPONENT
#undef SBMV_SCALAR
#undef SBMV_K

Hopefully these will be useful to someone else.

Update 11 January 2012:Of course, now that I've found the reference XBLAS, pretty much everything here could (should?) have been done within XBLAS' macro framework. XBLAS definitely a cleaner mixed precision approach.

08 September 2011

GNU parallel for parameter sweeps

Following up on previous failures, I've chosen to test a portion of my compressible channel code Suzerain across a small parameter sweep. This problem is ill-suited for a traditional HPC resource as many of the jobs will fail and none of them have to run for very long.

The problem is perfect for GNU parallel:

#!/bin/bash

# Build the fixed parameters for channel initialization
fixed="--Ma=1.5 --Re=3000 --Pr=0.7 --gamma=1.4 --beta=0.7 --k=8"

# Build the parameter sweep affix for GNU parallel, placeholders, and a case description
sweep=""                       ; case="case{#}"                      ; args="$case"
sweep="$sweep ::: 96 120"      ; Nx="--Nx={1}"                       ; args="$args $Nx"
sweep="$sweep ::: 72  96"      ; Ny="--Ny={2}"                       ; args="$args $Ny"
sweep="$sweep :::  2   3"      ; htdelta="--htdelta={3}"             ; args="$args $htdelta"
sweep="$sweep ::: 60  96"      ; Nz="--Nz={4}"                       ; args="$args $Nz"
sweep="$sweep ::: 125 150"     ; fluct_percent="--fluct_percent={5}" ; args="$args $fluct_percent"
sweep="$sweep ::: 12345 67890" ; fluct_seed="--fluct_seed={6}"       ; args="$args $fluct_seed"
sweep="$sweep :::  2   5"      ; alpha="--alpha={7}"                 ; args="$args $alpha"
sweep="$sweep ::: .4  .5"      ; npower="--npower={8}"               ; args="$args $npower"
sweep="$sweep ::: 0:1 .5:.75"  ; fluct_kxfrac="--fluct_kxfrac={9}"   ; args="$args $fluct_kxfrac"
sweep="$sweep ::: 0:1 .5:.75"  ; fluct_kzfrac="--fluct_kzfrac={10}"  ; args="$args $fluct_kzfrac"

# Remove any stale data
rm -rfv case* bad.case*

# Generate a master table of test cases for fun and profit
parallel -k -j 1 echo $args $sweep | column -t > master

# Generate inputs for a single test case
parallel "mkdir $case && cd $case && echo $args > args"                                                             \
     " && ../bin/channel_init initial.h5 $fixed $Nx $Ny $htdelta $Nz $alpha $npower"                                \
     " && ../bin/channel_explicit initial.h5 --advance_nt=0 $fluct_percent $fluct_seed $fluct_kxfrac $fluct_kzfrac" \
     $sweep

# Run each case for 1000 time steps to shake out non-robust scenarios
parallel --eta -j 3 -u                                                              \
     "cd {} && ../bin/channel_explicit restart0.h5 --advance_nt=1000 --status_nt=5" \
     ::: case*
 
# Flag test cases that glaringly failed with a ".bad" suffix
parallel "mv {//} bad.{//}" ::: $(grep -l "TimeController halted" case*/log.dat)

The first two "blocks" define the parameter sweep using parallel's feature where "::: a b ::: 1 2" turns into the outer product "a 1", "a 2", "b 1", "b 2". The next bit builds a table of these outer products so I can refer to it later. The remaining three bits perform some IO-heavy initialization, some compute-intensive tasks, and then a post-processing pass on each of the entries in said outer product. This logic, and the associated runtime process juggling to get nice batch throughput, would be hell without GNU parallel. Thanks Ole.

11 August 2011

ça marche (mal?)

Suzerain, my pseudospectral, compressible DNS code, is coming along. After some manufactured solution love I'm onto channel problems. With some added solenoidal velocity perturbations my laminar solutions transitioned to turbulent ones quite nicely.

However, I'm currently puzzling over why my Coleman-like case is showing stationary mean state behavior...

...but momentum fluctuations that grow in time:

Currently suspect things are:

Inadequate resolution ('tis quite lousy)
Pressure growing in time (a probable symptom I need to check but not a root cause)
Funkiness associated with the discrete mass conservation fixes (my numerics are not conservative and holding mass constant requires some integral constraint futzing)
Inadequate dealiasing (unlikely, currently using enough to handle quadratic nonlinearities in my Fourier directions but my equations are worse-than-cubic).

Thank you to Nick Malaya for poring over plots with me this morning.

09 August 2011

Welcome to a dumber planet.

Consider IBM's stock price over the last several years...

...and then today's news that IBM Terminated The Blue Waters Contract because of financial, rather than technical, reasons:

When HPCwire spoke with Herb Schultz, marketing manager for IBM's Deep Computing unit, last year, he outlined a new business model that would apply a lot more scrutiny to how the company positioned its high-end supercomputers. "There is really no appetite in IBM anymore -- with some of the leadership changes over the last few years -- for revenue that has no profit with it," he told us back in November 2010.... From NCSA's perspective, the system met all of its technical requirements. In particular, they appeared confident the machine, based on Power 755 servers, would indeed be able to deliver a sustained petaflop from its 10-petaflop peak performance.... The most curious aspect of the IBM pull-out is that they had already delivered three racks of Power 755 servers to NCSA.

It is a sad day when a screamingly profitable U.S. company reneges on a contract so fundamental to the next decade of basic U.S. science. Especially a contract where even the worst imaginable outcome for IBM has no potential to noticeably impact shareholder returns for even a single quarter.

09 July 2011

FFTW 3.3 MPI Cheat Sheet

Because I believe they provide a nice way to perform P3DFFT- and 2DECOMP&FFT-like MPI-parallel pencil decompositions, I have have spent some time staring at FFTW's 3.3 alpha and beta releases. In particular, poring over their Distributed-memory FFTW with MPI capabilities.

Using 3.3-alpha1 I wrote a small library (called underling) which mimicked P3DFFT 2.3's data movement capabilities. I wholly isolated the data movement from computing the FFTs. This, unlike P3DFFT and 2DECOMP, provides well-defined memory layout at any stage during the pencil transposes. Functionally my first approach was quite sound. However, it performs suboptimal on-node memory reshuffling (a design mistake on my part) and can stand to be improved.

I am revisiting the assumption of separating the parallel FFTs from the parallel data movement. Doing so allows using the higher-level 2D r2c/c2r planning APIs freshly documented for 3.3-beta. It should require less needless memory reshuffling when combined with appropriate FFTW_MPI_TRANPOSED_IN and FFTW_MPI_TRANSPOSED_OUT flag choices.

Because, though wonderfully written, the relevant sections of the FFTW MPI documentation do not provide a cheat sheet, I have cooked my own for the various piece parts I may use in a second attempt. Perhaps someone will find it useful. Be sure to check the FFTW MPI reference for full details, especially the local_size calls necessary to obtain data distribution information.

Please tell me if you catch any mistakes. Many thanks to Steven G. Johnson for correcting my earlier misunderstanding of the real-to-complex and complex-to-real 2D DFT semantics.

							transposed in					transposed out					transposed in\|out
transpose	in	n0/P	×	n1	×	nc	n1	×	n0/P	×	nc	n0/P	×	n1	×	nc	n1	×	n0/P	×	nc
transpose	out	n1/P	×	n0	×	nc	n1/P	×	n0	×	nc	n0	×	n1/P	×	nc	n0	×	n1/P	×	nc
c2c 2D	in	ñ0/P	×	ñ1	×	ñc	ñ1/P	×	ñ0	×	ñc	ñ0/P	×	ñ1	×	ñc	ñ1/P	×	ñ0	×	ñc
c2c 2D	out	ñ0/P	×	ñ1	×	ñc	ñ0/P	×	ñ1	×	ñc	ñ1/P	×	ñ0	×	ñc	ñ1/P	×	ñ0	×	ñc
r2c 2D	in	n0/P	×	2(n1/2+1)	×	nc	2(n1/2+1)/P	×	n0	×	nc	n0/P	×	2(n1/2+1)	×	nc	2(n1/2+1)/P	×	n0	×	nc
r2c 2D	out	ñ0/P	×	(ñ1/2+1)	×	ñc	ñ0/P	×	(ñ1/2+1)	×	ñc	(ñ1/2+1)/P	×	ñ0	×	ñc	(ñ1/2+1)/P	×	ñ0	×	ñc
c2r 2D	in	ñ0/P	×	ñ1/2+1	×	ñc	(ñ1/2+1)/P	×	ñ0	×	ñc	ñ0/P	×	ñ1/2+1	×	ñc	(ñ1/2+1)/P	×	ñ0	×	ñc
c2r 2D	out	n0/P	×	2(n1/2+1)	×	nc	n0/P	×	2(n1/2+1)	×	nc	2(n1/2+1)/P	×	n0	×	nc	2(n1/2+1)/P	×	n0	×	nc

Notation: Real-valued directions are denoted n0 and n1 while complex-valued directions are ñ0 and ñ1. Half-complex storage is denoted ñ/2+1 and its padded, real-valued counterpart is 2(n/2+1). Directions decomposed along a communicator with P processes are denoted n/P. nc stands for "number of components" and corresponds to the advanced planning API's howmany arguments.

05 July 2011

My divergence divergence.

I've spent three full work days chasing down a verification failure within my spectral, compressible Navier—Stokes code. I can't remember the last time I was so happy to average 1/3rd of a source line per day.

26 June 2011

Using Boost Spirit 2.1+ to evaluate constant arithmetic expressions

I used Boost Program_options for my research code's input parsing requirements. As expected, it works well. For floating point quantities, I've found having to specify 4.18879020478639 for 4π/3 is a bit cumbersome. When specifying arguments on the command line, I could use any one of a number of hacks to perform floating point arithmetic in bash. But that didn't help me when taking input from response files. I wanted an arithmetic expression utility that could process a C-like grammar. Long double accuracy was a requirement. I looked around a bit. GNU libmatheval and muParser appeared to be good fits but looked to be double precision-only. Though straightforward, I decided against combining the Shunting-yard algorithm with some in-place RPN logic because I wasn't enthusiastic about either finding or writing an appropriate (and admittedly toy simple) lexer.

Already having a Boost pre-requisite in my code, I decided to try out Boost Spirit since it could provide a header-only, precision-agnostic solution. Spirit has gone through a lot of significant revisions and many articles seemed to be written for the older Spirit Classic. The latest official reference manual gave several excellent examples that almost fit. I adapted Spirit's calc3.cpp and roman.cpp sample grammars to accomplish what I wanted. I added ** as an operator for exponentiation and pi as a built-in constant. I also added a smattering of special functions (e.g. exp). The task required about 150 lines of Spirit-based code. Further extensions should be fairly straightforward.
My sample code does the following:

Declares X-macros for the unary and binary functions built into the grammar.
Declares Boost Phoenix-ready functors for the unary and binary functions.
Declares expression::grammar consisting of the rules expression, term, factor, and primary.
Declares a helper called parse to hide needing boost::spirit::ascii::space when performing whitespace-agnostic parsing.
Declares a small test macro called EXPRTEST to reduce test case boilerplate.
Runs a collection of tests to ensure the parser is behaving as expected.

Compared to the samples on which I built, the biggest changes to the grammar were adding primary and reworking factor. I could have shortened up some details with a few using boost::spirit::qi; declarations, but I wanted to keep track of exactly which parts of Spirit I used. Feedback and suggestions are most welcome.

Updated 9 Jan 2012: During a second pass motivated by reducing compilation times, I removed the X-macro nastiness and replaced it with some qi::symbols-based logic. The code in the anonymous namespace could likely be removed if I could figure out the right boost::phoenix::bind hoodoo to use to replace lazy_ufunc_ and lazy_bfunc_ (any suggestions?). I have not yet measured the difference in either memory footprint or runtime performance. Compilation time and memory is greatly reduced for both GNU and Intel compilers.

Updated 31 Mar 2013: I have moved the code into a gist to facilitate both maintenance and gathering feedback. I have added the constant e and slightly reordered the primary grammar per the comments.

Updated 17-19 Sept 2013: I have added an MPL2 header to the gist. In the comments is a suggestion for how to use standalone MPL2 logic building atop this grammar from my project Suzerain to incorporate such parsing into your own, possibly non-MPL2, source tree.

24 May 2011

Adding enthought.traits.api.HasTraits as an ancestor using Python metaclasses

Today, while banging my head against a wall trying to get a SWIG-generated Python module to play nice with Python Traits, I cooked a way to use Python metaclasses to modify a class to add enthought.traits.has_traits.HasTraits as a base class:

from enthought.traits.api import HasTraits
from enthought.traits.has_traits import MetaHasTraits

# See http://onlamp.com/lpt/a/3388 for an intro to Python metaclasses and
# http://code.activestate.com/recipes/204197-solving-the-metaclass-conflict/
# for what can go wrong here
class AddHasTraitsAncestor(MetaHasTraits):
    def __new__(cls, name, bases, dct):
        return MetaHasTraits.__new__(cls, name, cls.modify(bases), dct)

    def __init__(cls, name, bases, dct):
        super(AddHasTraitsAncestor, cls).__init__(name, cls.modify(bases), dct)

    @staticmethod
    def modify(bases):
        if (bases == (object,)):
            return (HasTraits,)
        else:
            new_bases = list(bases)
            new_bases.insert(0, HasTraits)
            return tuple(new_bases)

Adding the line __metaclass__ = AddHasTraitsAncestor to a class definition is enough, in some cases, for the class to now have HasTraits as a base class. Most of the frustrating bits were before I realized HasTraits has its own metaclass MetaHasTraits which AddHasTraitsAncestor had to subclass before the approach worked.

"Uhhh...?" you say. That puzzled look asking "Why didn't you just add the base class yourself?". I used this approach because SWIG was generating the Python class definitions and I was abusing SWIG's pythoncode feature to tweak the SWIG wrappers to be Traits-ready a la

%extend someCPlusPlusClass {

    %pythoncode %{
        __metaclass__ = AddHasTraitsAncestor
    %}

}

Turns out it works and my SWIG wrappers are now Traits-friendly. I'm still working on getting Traits to see the SWIG-based members, however. Suggestions much appreciated.

16 April 2011

Using gnuplot from the command line

gnuplot is excellent for both quick 'n' dirty and lengthy plotting projects. Still, for truly quick work of just plottin' some column against some other column, I've always felt like gnuplot demands too many keystrokes:

[1000 rhys@tiresias 16]$ gnuplot

 G N U P L O T
 Version 4.2 patchlevel 6
 last modified Sep 2009
 System: Linux 2.6.32-30-generic

 Copyright (C) 1986 - 1993, 1998, 2004, 2007 - 2009
 Thomas Williams, Colin Kelley and many others

 Type `help` to access the on-line reference manual.
 The gnuplot FAQ is available from http://www.gnuplot.info/faq/

 Send bug reports and suggestions to 


Terminal type set to 'x11'
gnuplot> plot 'restart0.mean' using 2:17
gnuplot> exit

Of course, one can turn this process into a one-liner at the cost of quoting:

gnuplot -e "plot 'restart0.mean' using 2:17"

Call me picky, but the signal-to-noise ratio in that one-liner still feels too high.

After some quoting experimentation using printf's %q and %b format specifiers, the following little function definition

gplot () {
   local fileexpr=$(printf "'%q'" $1)
   shift
   local plotexpr=$(printf '%b' "plot ${fileexpr} $*")
   gnuplot -persist -raise -e "$plotexpr"
}

makes

gplot restart0.mean using 2:17

behave like the previous two examples. This function happily hides the file name quoting requirements and remains tab-completion-friendly.

Providing computed column values like gnuplot's using 1:$(cos($2)) on the command line still requires thought, but the straightforward tasks work thoughtlessly. Suggestions for improvements definitely welcome in the comments. One glaring bug is that passing a pipe expression as the file bombs.

Update 20110504: I've decided that I much prefer creating a gplot script containing

#!/bin/bash
file=$1
shift
gnuplot -persist -raise <(cat <<-GNUPLOTSCRIPT
plot '$file' $*
GNUPLOTSCRIPT
)

as it is much simpler to work through the quoting and far more extensible.

Update 20111031: Globbing, labelling, and a handful of other enhancements:

#!/bin/bash

# Fail on first error
set -e

# Create temporary files to hold gnuplot script
tmp1=`mktemp`
tmp2=`mktemp`
trap "rm -f $tmp1 $tmp2" EXIT

# Process (and then remove) command line options
# Build up any post-terminal processing in tmp2 for later use
autotitle=
title="`date +%r`: gplot $*"
terminal="set terminal x11 title '$title' enhanced persist"
cmd=plot
forexpr=
raise=
showhelp=
while getopts "3cf:ghils:re:t:x:y:z:F:SX:Y:Z:" opt
do
  case $opt in
    3) cmd=splot
       ;;
    c) autotitle=true
       ;;
    e) terminal='set term postscript eps enhanced color dashed rounded "Arial,14"'
       pause='' # Disable interactive on EPS output
       echo set output \"$OPTARG\" >> $tmp2
       ;;
    f) forexpr=" for [$OPTARG] "
       ;;
    g) echo "set grid" >> $tmp2
       ;;
    h) showhelp=0
       ;;
    i) pause='pause -1 "Plot interactive until Enter pressed.\nHit h in plot window for help.\n"'
       ;;
    l) echo "set logscale y" >> $tmp2
       ;;
    r) raise="-raise"
       ;;
    s) savescript="$OPTARG"
       ;;
    t) echo set title \"$OPTARG\" >> $tmp2
       ;;
    x) echo set xlabel \"$OPTARG\" >> $tmp2
       ;;
    y) echo set ylabel \"$OPTARG\" >> $tmp2
       ;;
    z) echo set zlabel \"$OPTARG\" >> $tmp2
       ;;
    F) pause="pause $OPTARG; replot; reread"
       ;;
    S) stdin=true
       ;;
    X) echo set xrange \[$OPTARG\] >> $tmp2
       ;;
    Y) echo set yrange \[$OPTARG\] >> $tmp2
       ;;
    Z) echo set zrange \[$OPTARG\] >> $tmp2
       ;;
  esac
done
shift $((OPTIND-1))

if [ x$showhelp != x ]; then
    cat <<-HERE
Usage: gplot [OPTION]... (FILE|EXTGLOB) GNUPLOTCMD...
Use gnuplot to plot one or more files directly from the command line.

  -3             Perform 3D plotting using gnuplot's splot command.
  -c             Populate the key using autotitling.
  -e FILE        Save an Encapsulated Postscript (eps) called FILE.
  -f FOREXPR     Prepend a 'for [FOREXPR]' to the plotting command.
  -g             Show grid lines.
  -h             Show this help message.
  -i             Interactive plotting mode.  Hit 'h' on plot for help.
  -l             Use logarithmic scale for y axis.
  -s FILE        Save the generated gnuplot as a script called FILE.
  -r             Have the window manager raise the plot window.
  -t TITLE       Set TITLE as the plot's title.
  -x XLABEL      Specify XLABEL as the x axis label.
  -y YLABEL      Specify YLABEL as the y axis label.
  -z ZLABEL      Specify ZLABEL as the z axis label.
  -F FREQUENCY   Replot the inputs every FREQUENCY seconds.
  -S             Prior to plotting, read auxililary gunplot from stdin.
  -X XLOW:XHIGH  Specify an explicit x axis range instead of autoscaling.
  -Y YLOW:YHIGH  Specify an explicit y axis range instead of autoscaling.
  -Z ZLOW:ZHIGH  Specify an explicit z axis range instead of autoscaling.

Examples (see gnuplot documentation for complete GNUPLOTCMD details):

  gplot -i foo.dat using 1:2 with linespoints
  gplot -s foo.gp -X 0:1 -Y 0:2 foo.dat using 1:2 with linespoints
  gplot -e foo.eps foo.dat using 1:2 with linespoints
  gplot -3 restart\*.dat using 1:2:3

On error, the failing gnuplot script is shown.
HERE
    exit $showhelp
fi

# Set terminal
echo "$terminal" >> $tmp1

# Slurp any settings built up during getops processing
cat $tmp2 >> $tmp1

# Obtain file(s) to plot from first argument using extended globbing
# Deliberately allow globbing to occur in this assignment
shopt -s extglob
declare -a files=($1)
shift

# Tweak autotitle based on options and incoming argument details
if [ "$autotitle" ]; then
    # Use columnheader option iff only one file provided
    if [ ${#files[@]} -eq 1 -a $cmd != splot ]; then
        echo 'set key autotitle columnheader' >> $tmp1
    else
        echo 'set key autotitle' >> $tmp1
    fi
else
    echo 'set key noautotitle' >> $tmp1
fi

# Possibly slurp standard input for further options
# FIXME Not working for 'echo foo | gplot'
if [ "$stdin" ]; then
    cat <&1 >> $tmp1
fi

# Build gnuplot script to plot possibly multiple files
declare -i count=0
for file in "${files[@]}"
do
    count+=1
    if [ $count -eq 1 ]; then
        echo "$cmd" "$forexpr" \'$file\' $* \\ >> $tmp1
    else
        echo "   "  , "$forexpr" \'$file\' $* \\ >> $tmp1
    fi
done
echo '' >> $tmp1

# If requested, save the plotting commands as an executable script
if [ "$savescript" ]; then
    echo '#!/usr/bin/env gnuplot' > "$savescript"
    cat "$tmp1" >> "$savescript"
    chmod a+x "$savescript"
fi

# If requested, add a pause command to the end of the script. Notice this was
# not added to the $savescript as it would be annoying in that context.
if [ "$pause" ]; then
    echo "$pause" >> $tmp1
fi

# Generate the plot
# Display the script when an error occurs to aid debugging
gnuplot $raise $tmp1 || cat $tmp1

01 April 2011

Noise.

Following up on some quasi-1D success I'm ready to calculate some 3D fields. I seed them by taking my 1D solution and add noise. This is purportedly a terrible way to kick off turbulence as the fluctuations have to be massive to ultimately take and transition into a correct field. I've been warned...

Needing a way to monitor the fluctuation strengths and being in coefficient-space for a mixed Fourier/B-Spline discretization, I'm trying out the L^2 norm of the fluctuating signal. By Parseval's theorem along with some minor linear algebra, this turns into a sum over the complex-valued coefficient magnitudes: Computation of the L^2 norm in a mixed Fourier/B-spline collocation discretization

Now, for the first case of the L^2 norm over all three directions I need the matrix M which computes the L^2 inner product of two functions given their B-spline coefficient expansions. The code to do this atop some old contributions to the GSL isn't bad, but testing that my banded derivative operator coefficients are correct is awful.

Happily, Mathematica 7 includes BSplineBasis which I can use to cook analytic results to test my implementation. Unhappily, BSplineBasis has issues computing derivatives and an awful amount of mucking around is required to get my analytical test cases: Finding Galerkin L^2-based Operators for B-spline discretizations

In the end, this is all The Right Thing ™ to do but man it eats time. Recognizing this particular rabbit hole and how often such things ensnare my little mind, I sent the following email to my wonderful wife today:

In case you ever wonder what I do with my days....

I stare at things like this for far too long to work out the math.

Then I sit and stare at the math for far too long to figure out the computer implementation.

Then I stare at the computer implementation to test it and clean it up.

Then I use the implementation at a single line within my thesis research code.

Then I go talk to my advisor on Mondays and report that a single line now works this week that didn't work last week.

— Rhys

10 March 2011

Steppin'

The code I've been working on for, quite literally, years is finally coming together. It's called Suzerain. Suzerain is a mixed Fourier, B-spline spectral DNS code for solving the compressible Navier–Stokes equations for a perfect gas. Nearly every technical thing I've written on this blog has been about it in some way.

My first sanity check has been advancing a quasi-1D, laminar channel flow profile to a steady state condition. Some plots follow with units nondimensionalized by the bulk velocity, channel half width, and wall temperature and pressure. The initial conditions were constant density and temperature along with a parabolic velocity profile. Happily, the simulation relaxes the way I expect.

Onward.

25 January 2011

Converting to and from human-readable byte counts

Converting some number of bytes (say 1024) into a human-readable byte count (say "1K") seems to be a common problem (simply search or ask StackOverflow). I also needed to go the other direction and turn things like "1.5 MB" into 1,572,864. Here's what I cooked up in C99:

// Adapted from http://stackoverflow.com/questions/3758606/
// how-to-convert-byte-size-into-human-readable-format-in-java
void to_human_readable_byte_count(long bytes,
                                  int si,
                                  double *coeff,
                                  const char **units)
{
    // Static lookup table of byte-based SI units
    static const char *suffix[][2] = { { "B",  "B"   },
                                       { "kB", "KiB" },
                                       { "MB", "MiB" },
                                       { "GB", "GiB" },
                                       { "TB", "TiB" },
                                       { "EB", "EiB" },
                                       { "ZB", "ZiB" },
                                       { "YB", "YiB" } };
    int unit = si ? 1000 : 1024;
    int exp = 0;
    if (bytes > 0) {
        exp = min( (int) (log(bytes) / log(unit)),
                   (int) sizeof(suffix) / sizeof(suffix[0]) - 1);
    }
    *coeff = bytes / pow(unit, exp);
    *units  = suffix[exp][!!si];
}

// Convert strings like the following into byte counts
//    5MB, 5 MB, 5M, 3.7GB, 123b, 456kB
// with some amount of forgiveness baked into the parsing.
long from_human_readable_byte_count(const char *str)
{
    // Parse leading numeric factor
    char *endptr;
    errno = 0;
    const double coeff = strtod(str, &endptr);
    if (errno) return -1;

    // Skip any intermediate white space
    while (isspace(*endptr)) ++endptr;

    // Read off first character which should be an SI prefix
    int exp  = 0;
    int unit = 1024;
    switch (toupper(*endptr)) {
        case 'B':  exp =  0; break;
        case 'K':  exp =  3; break;
        case 'M':  exp =  6; break;
        case 'G':  exp =  9; break;
        case 'T':  exp = 12; break;
        case 'E':  exp = 15; break;
        case 'Z':  exp = 18; break;
        case 'Y':  exp = 21; break;

        case ' ':
        case '\t':
        case '\0': exp =  0; goto done;

        default:   return -1;
    }
    ++endptr;

    // If an 'i' or 'I' is present use SI factor-of-1000 units
    if (toupper(*endptr) == 'I') {
        ++endptr;
        unit = 1000;
    }

    // Next character must be one of B/empty/whitespace
    switch (toupper(*endptr)) {
        case 'B':
        case ' ':
        case '\t': ++endptr;  break;

        case '\0': goto done;

        default:   return -1;
    }

    // Skip any remaining white space
    while (isspace(*endptr)) ++endptr;

    // Parse error on anything but a null terminator
    if (*endptr) return -1;

done:
    return exp ? coeff * pow(unit, exp / 3) : coeff;
}

15 January 2011

Using Argp with MPI-based applications

Argp is a great, great parser for command line options. When using it in MPI-based applications, there's a catch in that you want only one MPI rank to print --help information, usage warnings, error messages, etc. Otherwise, you get a whole mess of repeated, jumbled output as each MPI rank squawks about the same problem.

Here's a wrapper for Argp for use in MPI-based applications that solves just this nuisance:

#include <stdio.h>
#include <unistd.h>
#include "argp.h"

/**
 * Call <a href="http://www.gnu.org/s/libc/manual/html_node/Argp.html"
 * >Argp</a>'s \c argp_parse in an MPI-friendly way.  Processes
 * with nonzero rank will have their \c stdout and \c stderr redirected
 * to <tt>/dev/null</tt> during \c argp_parse.
 *
 * @param rank MPI rank of this process.  Output from \c argp_parse
 *             will only be observable from rank zero.
 * @param argp      Per \c argp_parse semantics.
 * @param argc      Per \c argp_parse semantics.
 * @param argv      Per \c argp_parse semantics.
 * @param flags     Per \c argp_parse semantics.
 * @param arg_index Per \c argp_parse semantics.
 * @param input     Per \c argp_parse semantics.
 *
 * @return Per \c argp_parse semantics.
 */
error_t mpi_argp_parse(const int rank,
                       const struct argp *argp,
                       int argc,
                       char **argv,
                       unsigned flags,
                       int *arg_index,
                       void *input);

error_t mpi_argp_parse(const int rank,
                       const struct argp *argp,
                       int argc,
                       char **argv,
                       unsigned flags,
                       int *arg_index,
                       void *input)
{
    // Flush stdout, stderr
    if (fflush(stdout))
        perror("mpi_argp_parse error flushing stdout prior to redirect");
    if (fflush(stderr))
        perror("mpi_argp_parse error flushing stderr prior to redirect");

    // Save stdout, stderr so we may restore them later
    int stdout_copy, stderr_copy;
    if ((stdout_copy = dup(fileno(stdout))) < 0)
        perror("mpi_argp_parse error duplicating stdout");
    if ((stderr_copy = dup(fileno(stderr))) < 0)
        perror("mpi_argp_parse error duplicating stderr");

    // On non-root processes redirect stdout, stderr to /dev/null
    if (rank) {
        if (!freopen("/dev/null", "a", stdout))
            perror("mpi_argp_parse error redirecting stdout");
        if (!freopen("/dev/null", "a", stderr))
            perror("mpi_argp_parse error redirecting stderr");
    }

    // Invoke argp per http://www.gnu.org/s/libc/manual/html_node/Argp.html
    error_t retval = argp_parse(argp, argc, argv, flags, arg_index, input);

    // Flush stdout, stderr again
    if (fflush(stdout))
        perror("mpi_argp_parse error flushing stdout after redirect");
    if (fflush(stderr))
        perror("mpi_argp_parse error flushing stderr after redirect");

    // Restore stdout, stderr
    if (dup2(stdout_copy, fileno(stdout)) < 0)
        perror("mpi_argp_parse error reopening stdout");
    if (dup2(stderr_copy, fileno(stderr)) < 0)
        perror("mpi_argp_parse error reopening stderr");

    // Close saved versions of stdout, stderr
    if (close(stdout_copy))
        perror("mpi_argp_parse error closing stdout_copy");
    if (close(stderr_copy))
        perror("mpi_argp_parse error closing stderr_copy");

    // Clear any errors that may have occurred on stdout, stderr
    clearerr(stdout);
    clearerr(stderr);

    // Return what argp_parse returned
    return retval;
}

04 January 2011

C header-only unit testing with FCTX

Just a quick hat tip to FCTX, a library I've found invaluable these past few months. FCTX provides header-only unit testing for C. Sure, if you're in C++ land there's a ton of xUnit-like frameworks available (with Boost.Test being my favorite), but for vanilla C projects FCTX wins hands down.

As an example, here's something I put together for a Stack Overflow response: The logic isn't rocket science, of course. But testing it in C without resorting to external libraries and complicated makefiles shouldn't be rocket science either. Provided that fct.h is in the same directory, this source will compile and run.

The Return of Agent Zlerich