// CUDA dense matrix multiplication (C = A * B) for N x N row-major double
// matrices loaded from binary dataset files.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 8192
// Host function to read matrix from file
|
|
void load_matrix(const char *filename, double *matrix) {
|
|
FILE *file = fopen(filename, "rb");
|
|
if (!file) {
|
|
perror("Cannot read file");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
fread(matrix, sizeof(double), N * N, file);
|
|
fclose(file);
|
|
}
// CUDA kernel: each thread computes one element of C
|
|
__global__ void matrixMultiply(const double *A, const double *B, double *C, int n) {
|
|
int row = blockIdx.y * blockDim.y + threadIdx.y; // Calculate matrix row index
|
|
int col = blockIdx.x * blockDim.x + threadIdx.x; // Calculate matrix column index
|
|
|
|
if (row < n && col < n) {
|
|
double sum = 0;
|
|
for (int k = 0; k < n; k++) {
|
|
sum += A[row * n + k] * B[k * n + col];
|
|
}
|
|
C[row * n + col] = sum;
|
|
}
|
|
}
int main() {
|
|
char matrix_a_path[256], matrix_b_path[256];
|
|
sprintf(matrix_a_path, "../dataset/%d/matrix_A.bin", N);
|
|
sprintf(matrix_b_path, "../dataset/%d/matrix_B.bin", N);
|
|
|
|
double *h_A = (double *)malloc(N * N * sizeof(double));
|
|
double *h_B = (double *)malloc(N * N * sizeof(double));
|
|
double *h_C = (double *)malloc(N * N * sizeof(double));
|
|
|
|
if (!h_A || !h_B || !h_C) {
|
|
perror("Memory allocation failed");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// Load matrices from files
|
|
load_matrix(matrix_a_path, h_A);
|
|
load_matrix(matrix_b_path, h_B);
|
|
|
|
// Allocate device memory
|
|
double *d_A, *d_B, *d_C;
|
|
cudaMalloc((void**)&d_A, N * N * sizeof(double));
|
|
cudaMalloc((void**)&d_B, N * N * sizeof(double));
|
|
cudaMalloc((void**)&d_C, N * N * sizeof(double));
|
|
|
|
// Copy data from host to device
|
|
cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(d_B, h_B, N * N * sizeof(double), cudaMemcpyHostToDevice);
|
|
|
|
// Define block and grid dimensions (using 16x16 blocks)
|
|
dim3 threadsPerBlock(16, 16);
|
|
dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
|
|
(N + threadsPerBlock.y - 1) / threadsPerBlock.y);
|
|
|
|
// Execute CUDA kernel
|
|
matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
|
|
cudaDeviceSynchronize();
|
|
|
|
// Copy result back from device to host
|
|
cudaMemcpy(h_C, d_C, N * N * sizeof(double), cudaMemcpyDeviceToHost);
|
|
|
|
// Free memory
|
|
cudaFree(d_A);
|
|
cudaFree(d_B);
|
|
cudaFree(d_C);
|
|
free(h_A);
|
|
free(h_B);
|
|
free(h_C);
|
|
|
|
return 0;
|
|
}