#include #include #include #define N 4096 #define FILE_A "matrix_A.bin" #define FILE_B "matrix_B.bin" // 主機端讀取矩陣檔案 void load_matrix(const char *filename, double *matrix) { FILE *file = fopen(filename, "rb"); if (!file) { perror("無法讀取檔案"); exit(EXIT_FAILURE); } fread(matrix, sizeof(double), N * N, file); fclose(file); } // CUDA 核函數:每個執行緒計算 C 中一個元素 __global__ void matrixMultiply(const double *A, const double *B, double *C, int n) { int row = blockIdx.y * blockDim.y + threadIdx.y; // 計算矩陣行索引 int col = blockIdx.x * blockDim.x + threadIdx.x; // 計算矩陣列索引 if (row < n && col < n) { double sum = 0; for (int k = 0; k < n; k++) { sum += A[row * n + k] * B[k * n + col]; } C[row * n + col] = sum; } } int main() { double *h_A = (double *)malloc(N * N * sizeof(double)); double *h_B = (double *)malloc(N * N * sizeof(double)); double *h_C = (double *)malloc(N * N * sizeof(double)); if (!h_A || !h_B || !h_C) { perror("記憶體配置失敗"); exit(EXIT_FAILURE); } // 從檔案載入矩陣 load_matrix(FILE_A, h_A); load_matrix(FILE_B, h_B); // 配置裝置記憶體 double *d_A, *d_B, *d_C; cudaMalloc((void**)&d_A, N * N * sizeof(double)); cudaMalloc((void**)&d_B, N * N * sizeof(double)); cudaMalloc((void**)&d_C, N * N * sizeof(double)); // 將主機資料複製到裝置 cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_B, h_B, N * N * sizeof(double), cudaMemcpyHostToDevice); // 定義區塊與網格大小 (這裡選擇 16x16 的區塊) dim3 threadsPerBlock(16, 16); dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x, (N + threadsPerBlock.y - 1) / threadsPerBlock.y); // 使用 cudaEvent 計時 cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); // 執行 CUDA 核函數 matrixMultiply<<>>(d_A, d_B, d_C, N); cudaDeviceSynchronize(); cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); printf("矩陣乘法完成,花費時間: %f 秒\n", milliseconds / 1000.0); // 將結果從裝置複製回主機 cudaMemcpy(h_C, d_C, N * N * sizeof(double), cudaMemcpyDeviceToHost); // 釋放記憶體 cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); free(h_A); free(h_B); free(h_C); return 0; }