commit 54468c2db733877021ad997eb2fa30b2bd7c3779
Author: Yorukot
Date:   Wed Feb 19 18:00:28 2025 +0800

    feat: update hcp testing

diff --git a/matrix multiplication/CUDA/matrix_mul.cu b/matrix multiplication/CUDA/matrix_mul.cu
new file mode 100644
--- /dev/null
+++ b/matrix multiplication/CUDA/matrix_mul.cu
@@ -0,0 +1,122 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define N 4096
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+// Abort with file/line context whenever a CUDA runtime call fails.
+#define CUDA_CHECK(call)                                                  \
+    do {                                                                  \
+        cudaError_t err_ = (call);                                        \
+        if (err_ != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_));                            \
+            exit(EXIT_FAILURE);                                           \
+        }                                                                 \
+    } while (0)
+
+// Host side: read an N x N matrix of doubles from a binary file.
+// Exits if the file cannot be opened or holds fewer than N * N elements,
+// so the multiplication never runs on partially initialised input.
+void load_matrix(const char *filename, double *matrix) {
+    const size_t count = (size_t)N * N;
+    FILE *file = fopen(filename, "rb");
+    if (!file) {
+        perror("無法讀取檔案");
+        exit(EXIT_FAILURE);
+    }
+    size_t got = fread(matrix, sizeof(double), count, file);
+    fclose(file);
+    if (got != count) {
+        fprintf(stderr, "%s: expected %zu elements, read %zu\n",
+                filename, count, got);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// CUDA kernel: each thread computes one element of C = A * B.
+// A, B and C are n x n matrices stored row-major. The launch must be 2D,
+// with x covering columns and y covering rows; threads that fall outside
+// the matrix do nothing.
+__global__ void matrixMultiply(const double *A, const double *B, double *C, int n) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y;  // row index in C
+    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column index in C
+
+    if (row < n && col < n) {
+        // size_t offsets keep row * n from overflowing int for large n.
+        const size_t row_base = (size_t)row * n;
+        double sum = 0;
+        for (int k = 0; k < n; k++) {
+            sum += A[row_base + k] * B[(size_t)k * n + col];
+        }
+        C[row_base + col] = sum;
+    }
+}
+
+int main() {
+    // Compute the size in size_t so N * N cannot overflow int if N grows.
+    const size_t bytes = (size_t)N * N * sizeof(double);
+
+    double *h_A = (double *)malloc(bytes);
+    double *h_B = (double *)malloc(bytes);
+    double *h_C = (double *)malloc(bytes);
+
+    if (!h_A || !h_B || !h_C) {
+        perror("記憶體配置失敗");
+        exit(EXIT_FAILURE);
+    }
+
+    // Load the input matrices from disk.
+    load_matrix(FILE_A, h_A);
+    load_matrix(FILE_B, h_B);
+
+    // Allocate device memory.
+    double *d_A, *d_B, *d_C;
+    CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
+    CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
+    CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));
+
+    // Copy the host inputs to the device.
+    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));
+
+    // Block and grid dimensions: 16x16 threads per block, grid rounded up.
+    dim3 threadsPerBlock(16, 16);
+    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
+
+    // Time the kernel with CUDA events.
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CUDA_CHECK(cudaEventRecord(start));
+
+    // Launch the kernel; launch-configuration errors show up here.
+    matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
+    CUDA_CHECK(cudaGetLastError());
+
+    // Waiting on the stop event both finishes the measurement and surfaces
+    // any error raised while the kernel was running.
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    float milliseconds = 0;
+    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+    printf("矩陣乘法完成,花費時間: %f 秒\n", milliseconds / 1000.0);
+
+    // Copy the result back to the host.
+    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));
+
+    // Release events and memory.
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    CUDA_CHECK(cudaFree(d_A));
+    CUDA_CHECK(cudaFree(d_B));
+    CUDA_CHECK(cudaFree(d_C));
+    free(h_A);
+    free(h_B);
+    free(h_C);
+
+    return 0;
+}
diff --git a/matrix multiplication/OpenMP/main.c b/matrix multiplication/OpenMP/main.c
new file mode 100644
--- /dev/null
+++ b/matrix multiplication/OpenMP/main.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define N 4096
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+/* Read an N x N matrix of doubles from a binary file.
+ * Exits if the file cannot be opened or holds fewer than N * N elements,
+ * so the multiplication never runs on partially initialised input. */
+void load_matrix(const char *filename, double *matrix) {
+    const size_t count = (size_t)N * N;
+    FILE *file = fopen(filename, "rb");
+    if (!file) {
+        perror("無法讀取檔案");
+        exit(EXIT_FAILURE);
+    }
+    size_t got = fread(matrix, sizeof(double), count, file);
+    fclose(file);
+    if (got != count) {
+        fprintf(stderr, "%s: expected %zu elements, read %zu\n",
+                filename, count, got);
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main() {
+    /* Element count in size_t so N * N cannot overflow int if N grows. */
+    const size_t count = (size_t)N * N;
+
+    double *A = (double *)malloc(count * sizeof(double));
+    double *B = (double *)malloc(count * sizeof(double));
+    double *C = (double *)calloc(count, sizeof(double));  /* zeroed accumulator */
+
+    if (!A || !B || !C) {
+        perror("記憶體配置失敗");
+        exit(EXIT_FAILURE);
+    }
+
+    load_matrix(FILE_A, A);
+    load_matrix(FILE_B, B);
+
+    double start = omp_get_wtime();
+
+    /* Rows of C are independent, so the outer loop is shared among threads.
+     * The i-k-j order reads B and updates C along rows (unit stride) instead
+     * of striding down a column of B. Each C element still accumulates its
+     * products in increasing k, so the summation order is unchanged. */
+    #pragma omp parallel for
+    for (int i = 0; i < N; i++) {
+        double *c_row = C + (size_t)i * N;
+        for (int k = 0; k < N; k++) {
+            const double a_ik = A[(size_t)i * N + k];
+            const double *b_row = B + (size_t)k * N;
+            for (int j = 0; j < N; j++) {
+                c_row[j] += a_ik * b_row[j];
+            }
+        }
+    }
+
+    double end = omp_get_wtime();
+    printf("矩陣乘法完成,花費時間: %f 秒\n", end - start);
+
+    free(A);
+    free(B);
+    free(C);
+    return 0;
+}
diff --git a/matrix multiplication/OpenMP/makefile b/matrix multiplication/OpenMP/makefile
new file mode 100644
--- /dev/null
+++ b/matrix multiplication/OpenMP/makefile
@@ -0,0 +1,15 @@
+# Builds the OpenMP matrix multiplication program.
+CC = gcc
+CFLAGS = -fopenmp -O2
+TARGET = matrix_mul
+SRC = main.c
+
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(CC) $(CFLAGS) -o $(TARGET) $(SRC)
+
+clean:
+	rm -f $(TARGET)
diff --git a/matrix multiplication/OpenMP/matrix_mul b/matrix multiplication/OpenMP/matrix_mul
new file mode 100755
index 0000000..602ea59
Binary files /dev/null and b/matrix multiplication/OpenMP/matrix_mul differ
diff --git a/matrix multiplication/generate_matrix b/matrix multiplication/generate_matrix
new file mode 100755
index 0000000..7691057
Binary files /dev/null and b/matrix multiplication/generate_matrix differ
diff --git a/matrix multiplication/generate_matrix.c b/matrix multiplication/generate_matrix.c
new file mode 100644
--- /dev/null
+++ b/matrix multiplication/generate_matrix.c
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Matrix dimension. Kept equal to N in the CUDA and OpenMP programs, which
+ * read exactly N * N doubles from each generated file. */
+#define N 4096
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+/* Write an N x N matrix of pseudo-random doubles in [0.0, 9.9] to filename,
+ * one row at a time so only a single row is held in memory.
+ * rand() is never seeded, so every run produces the same sequence.
+ * Exits on any open, allocation, write or close failure. */
+void generate_and_save_matrix(const char *filename) {
+    FILE *file = fopen(filename, "wb");
+    if (!file) {
+        perror("無法打開檔案");
+        exit(EXIT_FAILURE);
+    }
+
+    double *row = (double *)malloc(N * sizeof(double));
+    if (!row) {
+        perror("記憶體配置失敗");
+        fclose(file);
+        exit(EXIT_FAILURE);
+    }
+
+    for (size_t i = 0; i < N; i++) {
+        for (size_t j = 0; j < N; j++) {
+            row[j] = (double)(rand() % 100) / 10.0;
+        }
+        /* A short write (e.g. disk full) would leave a truncated matrix. */
+        if (fwrite(row, sizeof(double), N, file) != (size_t)N) {
+            perror(filename);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    free(row);
+    /* fclose flushes buffered data, so its failure also means data loss. */
+    if (fclose(file) != 0) {
+        perror(filename);
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main() {
+    generate_and_save_matrix(FILE_A);
+    generate_and_save_matrix(FILE_B);
+    printf("矩陣 A 和 B 已成功產生並存入 %s 和 %s\n", FILE_A, FILE_B);
+    return 0;
+}
diff --git a/matrix multiplication/makefile b/matrix multiplication/makefile
new file mode 100644
--- /dev/null
+++ b/matrix multiplication/makefile
@@ -0,0 +1,15 @@
+# Builds the matrix generator.
+CC = gcc
+CFLAGS = -fopenmp -O2
+TARGET = generate_matrix
+SRC = generate_matrix.c
+
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(CC) $(CFLAGS) -o $(TARGET) $(SRC)
+
+clean:
+	rm -f $(TARGET)