JCublas

Java bindings for CUBLAS



JCublas is a library that makes it possible to use CUBLAS, the NVIDIA CUDA implementation of the Basic Linear Algebra Subprograms, in Java applications.

JCublas provides methods in single and double precision (but note that not all CUDA-capable devices support double-precision computations).

You may obtain the latest version of JCublas and the JCublas source code in the Downloads section.



JCublas and JCublas2


The JCublas library contains two main classes: The JCublas class provides bindings for the classical CUBLAS functions. Starting with CUDA 4.0, NVIDIA has added a new API for CUBLAS, referred to as "CUBLAS V2". This new API is available through the JCublas2 class.


Documentation


You may either browse the JCublas API documentation here, or download the JCublas documentation in a ZIP file from the Downloads section.

Most of the documentation is directly taken from the CUBLAS header files.



Application


The following table shows a comparison of a sample program performing a 'sgemm' operation using CUBLAS. The left side shows the C code, which is adapted from the "Simple CUBLAS" code sample at the NVIDIA developer download website (for simplicity, error checks are omitted here). The right side shows the same operation performed with JCublas. Obviously, the usage of JCublas is very similar to the usage of plain CUBLAS in C.

You may also want to download a complete, compilable JCublas sample from the samples page, which performs a 'sgemm' operation, once in plain Java and once in JCublas, and verifies the result.
Simple CUBLAS in C Simple JCublas in Java
/* Includes, cuda */
#include "cublas.h"




/* Matrix size: every matrix in this sample is N x N (N * N elements) */
#define N (275)

/* Main */
/* Main
 *
 * Performs a single-precision matrix multiplication (sgemm) of two N x N
 * matrices with the legacy CUBLAS API: the host matrices are filled with
 * random values, copied to the device, multiplied there, and the result
 * is copied back into h_C.
 *
 * Error checks are intentionally omitted to keep the sample short; every
 * cublas* call reports a status that production code should inspect.
 */
int main(int argc, char** argv)
{
  float* h_A;
  float* h_B;
  float* h_C;
  float* d_A = 0;
  float* d_B = 0;
  float* d_C = 0;
  float alpha = 1.0f;
  float beta = 0.0f;
  int n2 = N * N;   /* number of elements in each N x N matrix */
  int i;

  /* Initialize CUBLAS */
  cublasInit();

  /* Allocate host memory for the matrices */
  h_A = (float*)malloc(n2 * sizeof(h_A[0]));
  h_B = (float*)malloc(n2 * sizeof(h_B[0]));
  h_C = (float*)malloc(n2 * sizeof(h_C[0]));

  /* Fill the matrices with test data (random values in [0, 1]) */
  for (i = 0; i < n2; i++)
  {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
    h_C[i] = rand() / (float)RAND_MAX;
  }

  /* Allocate device memory for the matrices */
  cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
  cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
  cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);

  /* Initialize the device matrices with the host matrices */
  cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
  cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
  cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

  /* Performs operation using cublas:
   * C = alpha * A * B + beta * C, with neither A nor B transposed ('n') */
  cublasSgemm('n', 'n', N, N, N, alpha,
        d_A, N, d_B, N, beta, d_C, N);

  /* Read the result back */
  cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

  /* Memory clean up */
  free(h_A);
  free(h_B);
  free(h_C);
  cublasFree(d_A);
  cublasFree(d_B);
  cublasFree(d_C);

  /* Shutdown */
  cublasShutdown();

  return EXIT_SUCCESS;
}

/* Imports, JCublas */
import jcuda.*;
import jcuda.jcublas.*;

class JCublasSample
{
  /* Matrix size */
  private static final int N = 275;

  /* Main */
  public static void main(String args[])
  {
    float h_A[];
    float h_B[];
    float h_C[];
    Pointer d_A = new Pointer();
    Pointer d_B = new Pointer();
    Pointer d_C = new Pointer();
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;

    /* Initialize JCublas */
    JCublas.cublasInit();

    /* Allocate host memory for the matrices */
    h_A = new float[n2];
    h_B = new float[n2];
    h_C = new float[n2];

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
      h_A[i(float)Math.random();
      h_B[i(float)Math.random();
      h_C[i(float)Math.random();
    }

    /* Allocate device memory for the matrices */
    JCublas.cublasAlloc(n2, Sizeof.FLOAT, d_A);
    JCublas.cublasAlloc(n2, Sizeof.FLOAT, d_B);
    JCublas.cublasAlloc(n2, Sizeof.FLOAT, d_C);

    /* Initialize the device matrices with the host matrices */
    JCublas.cublasSetVector(n2, Sizeof.FLOAT, Pointer.to(h_A)1, d_A, 1);
    JCublas.cublasSetVector(n2, Sizeof.FLOAT, Pointer.to(h_B)1, d_B, 1);
    JCublas.cublasSetVector(n2, Sizeof.FLOAT, Pointer.to(h_C)1, d_C, 1);

    /* Performs operation using JCublas */
    JCublas.cublasSgemm('n''n', N, N, N, alpha,
              d_A, N, d_B, N, beta, d_C, N);

    /* Read the result back */
    JCublas.cublasGetVector(n2, Sizeof.FLOAT, d_C, 1, Pointer.to(h_C)1);

    /* Memory clean up */



    JCublas.cublasFree(d_A);
    JCublas.cublasFree(d_B);
    JCublas.cublasFree(d_C);

    /* Shutdown */
    JCublas.cublasShutdown();

  }
}