#include #include #include #define N 8192 // Host function to read matrix from file void load_matrix(const char *filename, double *matrix) { FILE *file = fopen(filename, "rb"); if (!file) { perror("Cannot read file"); exit(EXIT_FAILURE); } fread(matrix, sizeof(double), N * N, file); fclose(file); } // CUDA kernel: each thread computes one element of C __global__ void matrixMultiply(const double *A, const double *B, double *C, int n) { int row = blockIdx.y * blockDim.y + threadIdx.y; // Calculate matrix row index int col = blockIdx.x * blockDim.x + threadIdx.x; // Calculate matrix column index if (row < n && col < n) { double sum = 0; for (int k = 0; k < n; k++) { sum += A[row * n + k] * B[k * n + col]; } C[row * n + col] = sum; } } int main() { char matrix_a_path[256], matrix_b_path[256]; sprintf(matrix_a_path, "../dataset/%d/matrix_A.bin", N); sprintf(matrix_b_path, "../dataset/%d/matrix_B.bin", N); double *h_A = (double *)malloc(N * N * sizeof(double)); double *h_B = (double *)malloc(N * N * sizeof(double)); double *h_C = (double *)malloc(N * N * sizeof(double)); if (!h_A || !h_B || !h_C) { perror("Memory allocation failed"); exit(EXIT_FAILURE); } // Load matrices from files load_matrix(matrix_a_path, h_A); load_matrix(matrix_b_path, h_B); // Allocate device memory double *d_A, *d_B, *d_C; cudaMalloc((void**)&d_A, N * N * sizeof(double)); cudaMalloc((void**)&d_B, N * N * sizeof(double)); cudaMalloc((void**)&d_C, N * N * sizeof(double)); // Copy data from host to device cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_B, h_B, N * N * sizeof(double), cudaMemcpyHostToDevice); // Define block and grid dimensions (using 16x16 blocks) dim3 threadsPerBlock(16, 16); dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x, (N + threadsPerBlock.y - 1) / threadsPerBlock.y); // Execute CUDA kernel matrixMultiply<<>>(d_A, d_B, d_C, N); cudaDeviceSynchronize(); // Copy result back from device to host cudaMemcpy(h_C, d_C, N * N * sizeof(double), cudaMemcpyDeviceToHost); // Free memory cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); free(h_A); free(h_B); free(h_C); return 0; }