CUDA Kernels API Reference
This section documents the CUDA implementations for GPU-accelerated operations in AnalysisG.
Overview
The PyC package contains CUDA kernels for:
Physics calculations (ΔR, invariant mass, etc.)
Graph operations (PageRank, message passing)
Neutrino reconstruction
Mathematical operators
Coordinate transformations
Atomic operations
All CUDA code is in src/AnalysisG/pyc/*/cuda/ and *.cu files.
CUDA Architecture
AnalysisG CUDA code follows these patterns:
Kernel Functions: Marked with
__global__for device executionDevice Functions: Marked with
__device__for kernel-only callsHost Functions: CPU interface that launches kernels
Memory Management: Efficient CPU ↔ GPU transfers
Error Checking: CUDA error handling throughout
Graph Operations (pyc/graph)
PageRank CUDA Implementation
Location: pyc/graph/cuda/pagerank.cu
GPU-accelerated PageRank algorithm for graph analysis.
Kernel Signature:
__global__ void pagerank_kernel(
const int* edge_index, // Edge connectivity [2, num_edges]
float* pagerank_scores, // Output scores [num_nodes]
const int num_nodes,
const int num_edges,
const float damping, // Damping factor (typically 0.85)
const int max_iterations
);
Usage Pattern:
// Host code
void pagerank_cuda(
torch::Tensor edge_index,
torch::Tensor scores,
float damping = 0.85f,
int max_iters = 100
) {
int num_nodes = scores.size(0);
int num_edges = edge_index.size(1);
// Launch kernel
int block_size = 256;
int num_blocks = (num_nodes + block_size - 1) / block_size;
pagerank_kernel<<<num_blocks, block_size>>>(
edge_index.data_ptr<int>(),
scores.data_ptr<float>(),
num_nodes, num_edges, damping, max_iters
);
}
Performance: Processes millions of nodes/edges efficiently on GPU.
Graph Construction
Location: pyc/graph/cuda/graph.cu
Builds graph structures from particle data.
Key Kernels:
// Build edge list based on distance threshold
__global__ void build_edges_kernel(
const float* positions, // [num_particles, 3]
int* edge_index, // Output [2, max_edges]
int* num_edges, // Output edge count
const int num_particles,
const float threshold // Max distance for edge
);
// Compute edge features (ΔR, etc.)
__global__ void edge_features_kernel(
const float* node_features, // [num_nodes, feat_dim]
const int* edge_index,
float* edge_features, // Output
const int num_edges
);
Physics Calculations (pyc/physics)
DeltaR Calculation
Location: pyc/physics/cuda/physics.cu
Computes angular separation between particles.
Kernel:
__device__ float compute_deltaR(
float eta1, float phi1,
float eta2, float phi2
) {
float deta = eta1 - eta2;
float dphi = phi1 - phi2;
// Wrap phi to [-π, π]
while (dphi > M_PI) dphi -= 2*M_PI;
while (dphi < -M_PI) dphi += 2*M_PI;
return sqrtf(deta*deta + dphi*dphi);
}
__global__ void deltaR_matrix_kernel(
const float* eta, // [N]
const float* phi, // [N]
float* dr_matrix, // Output [N, N]
const int N
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N) {
dr_matrix[i*N + j] = compute_deltaR(
eta[i], phi[i], eta[j], phi[j]
);
}
}
Usage: Computes all-pairs ΔR matrix for N particles in O(N²) but parallelized on GPU.
Invariant Mass
Kernel:
__global__ void invariant_mass_kernel(
const float* px, // [N]
const float* py,
const float* pz,
const float* e,
const int* combinations, // [M, K] - which particles to combine
float* masses, // Output [M]
const int M, // Number of combinations
const int K // Particles per combination
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= M) return;
// Sum four-momentum
float total_px = 0, total_py = 0, total_pz = 0, total_e = 0;
for (int k = 0; k < K; k++) {
int particle_idx = combinations[idx * K + k];
total_px += px[particle_idx];
total_py += py[particle_idx];
total_pz += pz[particle_idx];
total_e += e[particle_idx];
}
// Compute invariant mass
float p2 = total_px*total_px + total_py*total_py + total_pz*total_pz;
masses[idx] = sqrtf(total_e*total_e - p2);
}
Neutrino Reconstruction (pyc/nusol)
Location: pyc/nusol/cuda/
GPU-accelerated neutrino reconstruction using multiple algorithms.
Analytical W → lν Solver
File: analytical_w.cu
Solves for neutrino pz given W boson mass constraint.
Kernel:
__global__ void solve_neutrino_pz_kernel(
const float* lepton_px, // [N]
const float* lepton_py,
const float* lepton_pz,
const float* lepton_e,
const float* met_px, // Missing ET x
const float* met_py, // Missing ET y
float* nu_pz_solutions, // Output [N, 2] (two solutions)
int* num_solutions, // Output [N] (0, 1, or 2)
const float W_mass,
const int N
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) return;
// Solve quadratic equation for neutrino pz
// (details omitted for brevity)
float a = /* ... */;
float b = /* ... */;
float c = /* ... */;
float discriminant = b*b - 4*a*c;
if (discriminant < 0) {
num_solutions[idx] = 0;
} else if (discriminant == 0) {
nu_pz_solutions[idx*2] = -b / (2*a);
num_solutions[idx] = 1;
} else {
float sqrt_d = sqrtf(discriminant);
nu_pz_solutions[idx*2] = (-b + sqrt_d) / (2*a);
nu_pz_solutions[idx*2 + 1] = (-b - sqrt_d) / (2*a);
num_solutions[idx] = 2;
}
}
Performance: Solves thousands of neutrino reconstruction problems in parallel.
Numerical Optimization
File: numerical_optimizer.cu
Chi-squared minimization for top → Wb → lνb reconstruction.
Key Features:
Parallel gradient descent
Constraint handling
Multiple initial conditions
Best solution selection
Mathematical Operators (pyc/operators)
Location: pyc/operators/cuda/operators.cu
CUDA implementations of common mathematical operations.
Matrix Operations
__global__ void matrix_multiply_kernel(
const float* A, // [M, K]
const float* B, // [K, N]
float* C, // Output [M, N]
int M, int K, int N
);
__global__ void transpose_kernel(
const float* input, // [M, N]
float* output, // [N, M]
int M, int N
);
Element-wise Operations
__global__ void relu_kernel(float* data, int N);
__global__ void sigmoid_kernel(float* data, int N);
__global__ void softmax_kernel(float* data, int N, int dim);
Coordinate Transforms (pyc/transform)
Location: pyc/transform/cuda/transform.cu
Coordinate system transformations.
Cartesian ↔ Spherical
__global__ void cartesian_to_spherical_kernel(
const float* px,
const float* py,
const float* pz,
float* pt, // Output
float* eta, // Output
float* phi, // Output
int N
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) return;
float px_val = px[idx];
float py_val = py[idx];
float pz_val = pz[idx];
// pt = sqrt(px² + py²)
pt[idx] = sqrtf(px_val*px_val + py_val*py_val);
// phi = atan2(py, px)
phi[idx] = atan2f(py_val, px_val);
// eta = -ln(tan(θ/2)) where θ = atan2(pt, pz)
float theta = atan2f(pt[idx], pz_val);
eta[idx] = -logf(tanf(theta / 2.0f));
}
Lorentz Boosts
__global__ void lorentz_boost_kernel(
const float* px_in, const float* py_in,
const float* pz_in, const float* e_in,
float* px_out, float* py_out,
float* pz_out, float* e_out,
float beta_x, float beta_y, float beta_z,
int N
);
Atomic Operations (pyc/cutils)
Location: pyc/cutils/cuda/atomic.cu
Thread-safe atomic operations for CUDA.
Custom Atomics
__device__ float atomicMaxFloat(float* address, float val) {
int* address_as_int = (int*)address;
int old = *address_as_int, assumed;
do {
assumed = old;
old = atomicCAS(address_as_int, assumed,
__float_as_int(fmaxf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
}
__device__ double atomicAddDouble(double* address, double val) {
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
Performance Optimization
Thread Configuration
Typical kernel launch patterns:
// 1D grid
int block_size = 256; // Threads per block
int grid_size = (N + block_size - 1) / block_size;
kernel<<<grid_size, block_size>>>(args);
// 2D grid (for matrices)
dim3 block(16, 16);
dim3 grid((M + 15) / 16, (N + 15) / 16);
kernel<<<grid, block>>>(args);
Memory Patterns
Coalesced Access: Ensure adjacent threads access adjacent memory:
// Good: Coalesced
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float val = data[idx];
// Bad: Strided access
int idx = threadIdx.x * blockDim.x + blockIdx.x;
float val = data[idx];
Shared Memory: Use for data reuse within block:
__global__ void kernel_with_shared() {
__shared__ float cache[256];
int tid = threadIdx.x;
cache[tid] = global_data[blockIdx.x * blockDim.x + tid];
__syncthreads();
// Use cache[] instead of global memory
}
Error Handling
All CUDA calls should check for errors:
#define CUDA_CHECK(call) { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error in %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(1); \
} \
}
// Usage
CUDA_CHECK(cudaMalloc(&d_data, size));
kernel<<<grid, block>>>(args);
CUDA_CHECK(cudaDeviceSynchronize());
Building CUDA Code
CUDA code is compiled automatically during installation if CUDA is available:
# CUDA must be in PATH
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install (will compile CUDA if available)
pip install -e .
Requirements:
CUDA Toolkit 11.0 or later
Compatible GPU (Compute Capability 6.0+)
nvcc compiler
See Also
C++ Modules API Reference: C++ modules documentation
../pyc/overview: PyC package overview
Complex Technical Components: Technical overview