JCudpp

Java bindings for CUDPP



This library enables Java applications to use the public interface of CUDPP, the CUDA Data Parallel Primitives Library, version 1.1.1, which contains methods for sparse matrix-vector multiplication, parallel scans, and sorting.



General information about JCudpp


JCudpp is only a Java binding for CUDPP. That means, in order to use JCudpp, you need an installation of CUDPP - namely, the CUDPP library file, like the CUDPP.DLL for Windows, or the libCudpp.so for Linux.

By default, the required libraries are automatically installed when you install the NVIDIA CUDA SDK. The libraries are then contained in the NVIDIA Corporation\NVIDIA GPU Computing SDK\C\bin directory of the SDK. For example, the CUDPP DLL for 32 bit Windows may be found in NVIDIA Corporation\NVIDIA GPU Computing SDK\C\bin\win32\Release.


Alternatively, you may compile this library on your own, using the source code distribution of CUDPP that may be obtained from the CUDPP home page.



Documentation


You may either browse the JCudpp API documentation here, or download the JCudpp documentation in a ZIP file from the Downloads section.

Most of the documentation is directly taken from the CUDPP Public Interface API documentation on the CUDPP home page.



Application



The following table shows a comparison of a sample program performing a parallel prefix scan operation using CUDPP. The left side shows the C code, which is adapted from the "Simple CUDPP" code sample on the CUDPP home page (for simplicity, error checks are omitted here). The right side shows the same operation performed with JCudpp. The workflow and the involved operations are quite similar.

(Using JCudpp for parallel prefix sums is easy! ;-) )

There is also a complete, compilable JCudpp sample on the samples page which sorts an array of integers, once in plain Java and once in JCudpp, and verifies the result.


Simple CUDPP in C Simple JCudpp in Java
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include "cudpp.h"





// Program main
int mainint argc, char** argv)
{
    unsigned int numElements = 32768;
    unsigned int memSize = sizeoffloat* numElements;

    // allocate host memory
    float* h_idata = (float*mallocmemSize);

    // initialize the memory
    for (unsigned int i = 0; i < numElements; ++i)
    {
        h_idata[i(float) (rand() 0xf);
    }

    // allocate device memory
    float* d_idata;
    cudaMalloc((void**&d_idata, memSize);

    // copy host memory to device
    cudaMemcpy(d_idata, h_idata, memSize,
        cudaMemcpyHostToDevice);

    // allocate device memory for result
    float* d_odata;
    cudaMalloc((void**&d_odata, memSize);

    CUDPPConfiguration config;
    config.op = CUDPP_ADD;
    config.datatype = CUDPP_FLOAT;
    config.algorithm = CUDPP_SCAN;
    config.options = CUDPP_OPTION_FORWARD |
                     CUDPP_OPTION_EXCLUSIVE;

    CUDPPHandle scanplan = 0;
    cudppPlan(&scanplan, config, numElements, 10);

    // Run the scan
    cudppScan(scanplan, d_odata, d_idata, numElements);

    // allocate mem for the result on host side
    float* h_odata = (float*mallocmemSize);

    // copy result from device to host
    cudaMemcpy(h_odata, d_odata, memSize,
               cudaMemcpyDeviceToHost);

    cudppDestroyPlan(scanplan);
    free(h_idata);
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return 0;
}






// includes, project
import jcuda.*;
import jcuda.runtime.*;
import jcuda.jcudpp.*;

class JCudppSample
{
    // Program main
    public static void main(String args[])
    {
        int numElements = 32768;
        int memSize = Sizeof.FLOAT * numElements;

        // allocate host memory
        float h_idata[] new float[numElements];

        // initialize the memory
        for (int i = 0; i < numElements; ++i)
        {
            h_idata[i(float)Math.random();
        }

        // allocate device memory
        Pointer d_idata = new Pointer();
        JCuda.cudaMalloc(d_idata, memSize);

        // copy host memory to device
        JCuda.cudaMemcpy(d_idata, Pointer.to(h_idata), memSize,
            cudaMemcpyKind.cudaMemcpyHostToDevice);

        // allocate device memory for result
        Pointer d_odata = new Pointer();
        JCuda.cudaMalloc(d_odata, memSize);

        CUDPPConfiguration config = new CUDPPConfiguration();
        config.op = CUDPPOperator.CUDPP_ADD;
        config.datatype = CUDPPDatatype.CUDPP_FLOAT;
        config.algorithm = CUDPPAlgorithm.CUDPP_SCAN;
        config.options = CUDPPOption.CUDPP_OPTION_FORWARD |
                         CUDPPOption.CUDPP_OPTION_EXCLUSIVE;

        CUDPPHandle scanplan = new CUDPPHandle();
        JCudpp.cudppPlan(scanplan, config, numElements, 10);

        // Run the scan
        JCudpp.cudppScan(scanplan, d_odata, d_idata, numElements);

        // allocate mem for the result on host side
        float h_odata[] new float[numElements];

        // copy result from device to host
        JCuda.cudaMemcpy(Pointer.to(h_odata), d_odata, memSize,
            cudaMemcpyKind.cudaMemcpyDeviceToHost);

        JCudpp.cudppDestroyPlan(scanplan);


        JCuda.cudaFree(d_idata);
        JCuda.cudaFree(d_odata);

    }
}