feat: update hcp testing

2025-02-19 18:00:28 +08:00 · 2025-02-19 18:00:28 +08:00 · 54468c2db7
commit 54468c2db7
7 changed files with 202 additions and 0 deletions
--- a/multiplication/CUDA/matrix_mul.cu
+++ b/multiplication/CUDA/matrix_mul.cu
@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define N 4096
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+// Host-side loader: reads exactly N * N doubles from `filename` into `matrix`.
+//
+// `matrix` must point to storage for at least N * N doubles. The program exits
+// with EXIT_FAILURE if the file cannot be opened or if fewer than N * N doubles
+// could be read, so a short read never leaves part of the buffer uninitialized.
+// Anything in the file past the first N * N doubles is ignored.
+void load_matrix(const char *filename, double *matrix) {
+    FILE *file = fopen(filename, "rb");
+    if (!file) {
+        perror("無法讀取檔案");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t expected = (size_t)N * N;
+    // fread returns the number of complete elements actually read.
+    const size_t got = fread(matrix, sizeof(double), expected, file);
+    fclose(file);
+
+    if (got != expected) {
+        fprintf(stderr, "讀取檔案 %s 失敗: 預期 %zu 個元素，實際讀到 %zu 個\n",
+                filename, expected, got);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// CUDA kernel: each thread computes one element of C = A * B.
+//
+// A, B and C are n x n matrices indexed row-major (element (r, c) lives at
+// r * n + c). The kernel expects a 2D launch where the x dimension covers
+// columns and the y dimension covers rows. Threads that land outside the
+// n x n range do nothing, so the grid does not have to divide n evenly.
+__global__ void matrixMultiply(const double *A, const double *B, double *C, int n) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y; // row of C handled by this thread
+    int col = blockIdx.x * blockDim.x + threadIdx.x; // column of C handled by this thread
+
+    if (row >= n || col >= n) {
+        return;
+    }
+
+    // Flat offsets are computed in size_t: row * n + k overflows a 32-bit int
+    // as soon as n exceeds 46340.
+    const size_t stride = (size_t)n;
+    const double *a_row = A + (size_t)row * stride; // start of row `row` in A
+    const double *b_col = B + col;                  // top of column `col` in B
+
+    double sum = 0;
+    for (size_t k = 0; k < stride; k++) {
+        sum += a_row[k] * b_col[k * stride];
+    }
+    C[(size_t)row * stride + col] = sum;
+}
+
+// Terminates the program with file/line context when a CUDA runtime call
+// returns anything other than cudaSuccess.
+#define CUDA_CHECK(call)                                                     \
+    do {                                                                     \
+        cudaError_t err_ = (call);                                           \
+        if (err_ != cudaSuccess) {                                           \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
+                    cudaGetErrorString(err_));                               \
+            exit(EXIT_FAILURE);                                              \
+        }                                                                    \
+    } while (0)
+
+// Benchmark driver: loads A and B from FILE_A / FILE_B, multiplies them on the
+// GPU with matrixMultiply, prints the kernel time measured with CUDA events,
+// and copies the product back to the host. Host-to-device and device-to-host
+// copies are outside the timed region.
+int main() {
+    const size_t bytes = (size_t)N * N * sizeof(double);
+
+    double *h_A = (double *)malloc(bytes);
+    double *h_B = (double *)malloc(bytes);
+    double *h_C = (double *)malloc(bytes);
+
+    if (!h_A || !h_B || !h_C) {
+        perror("記憶體配置失敗");
+        exit(EXIT_FAILURE);
+    }
+
+    // Load the input matrices from disk.
+    load_matrix(FILE_A, h_A);
+    load_matrix(FILE_B, h_B);
+
+    // Allocate device memory.
+    double *d_A = NULL, *d_B = NULL, *d_C = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
+    CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
+    CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));
+
+    // Copy the host data to the device.
+    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));
+
+    // Block and grid dimensions: 16x16 threads per block, with the grid
+    // rounded up so every element of C is covered.
+    dim3 threadsPerBlock(16, 16);
+    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
+
+    // Time the kernel with CUDA events.
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CUDA_CHECK(cudaEventRecord(start));
+
+    // Launch the kernel. The launch itself returns no status, so an invalid
+    // launch configuration has to be picked up through cudaGetLastError().
+    matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
+    CUDA_CHECK(cudaGetLastError());
+
+    // Record `stop` directly behind the kernel and wait on that event; this
+    // also reports errors raised while the kernel was executing.
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    float milliseconds = 0;
+    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+    printf("矩陣乘法完成，花費時間: %f 秒\n", milliseconds / 1000.0);
+
+    // Copy the result from the device back to the host.
+    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));
+
+    // Release events and memory.
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    CUDA_CHECK(cudaFree(d_A));
+    CUDA_CHECK(cudaFree(d_B));
+    CUDA_CHECK(cudaFree(d_C));
+    free(h_A);
+    free(h_B);
+    free(h_C);
+
+    return 0;
+}
--- a/multiplication/OpenMP/main.c
+++ b/multiplication/OpenMP/main.c
@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define N 4096
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+/*
+ * Reads exactly N * N doubles from `filename` into `matrix`.
+ *
+ * `matrix` must point to storage for at least N * N doubles. The program exits
+ * with EXIT_FAILURE if the file cannot be opened or if fewer than N * N
+ * doubles could be read, so a short read never leaves part of the buffer
+ * uninitialized. Anything in the file past the first N * N doubles is ignored.
+ */
+void load_matrix(const char *filename, double *matrix) {
+    FILE *file = fopen(filename, "rb");
+    if (!file) {
+        perror("無法讀取檔案");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t expected = (size_t)N * N;
+    /* fread returns the number of complete elements actually read. */
+    const size_t got = fread(matrix, sizeof(double), expected, file);
+    fclose(file);
+
+    if (got != expected) {
+        fprintf(stderr, "讀取檔案 %s 失敗: 預期 %zu 個元素，實際讀到 %zu 個\n",
+                filename, expected, got);
+        exit(EXIT_FAILURE);
+    }
+}
+
+/*
+ * Benchmark driver: loads the two N x N operands from FILE_A and FILE_B,
+ * multiplies them with an OpenMP-parallel triple loop and prints the elapsed
+ * wall-clock time of the multiplication only (loading is not timed).
+ */
+int main() {
+    double *lhs = (double *)malloc(N * N * sizeof(double));
+    double *rhs = (double *)malloc(N * N * sizeof(double));
+    /* calloc: the loop below accumulates into the product, so it must start at zero. */
+    double *product = (double *)calloc(N * N, sizeof(double));
+
+    if (lhs == NULL || rhs == NULL || product == NULL) {
+        perror("記憶體配置失敗");
+        exit(EXIT_FAILURE);
+    }
+
+    load_matrix(FILE_A, lhs);
+    load_matrix(FILE_B, rhs);
+
+    const double t_begin = omp_get_wtime();
+
+    /* The outer (row) loop is the one shared out across the OpenMP threads. */
+    #pragma omp parallel for
+    for (int row = 0; row < N; ++row) {
+        const double *lhs_row = &lhs[row * N];
+        double *out_row = &product[row * N];
+        for (int col = 0; col < N; ++col) {
+            for (int inner = 0; inner < N; ++inner) {
+                out_row[col] += lhs_row[inner] * rhs[inner * N + col];
+            }
+        }
+    }
+
+    const double t_end = omp_get_wtime();
+    printf("矩陣乘法完成，花費時間: %f 秒\n", t_end - t_begin);
+
+    free(lhs);
+    free(rhs);
+    free(product);
+    return 0;
+}
--- a/multiplication/OpenMP/makefile
+++ b/multiplication/OpenMP/makefile
@ -0,0 +1,12 @@
+# Builds the OpenMP matrix-multiplication benchmark (matrix_mul) from main.c.
+CC = gcc
+CFLAGS = -fopenmp -O2
+TARGET = matrix_mul
+SRC = main.c
+
+# `all` and `clean` name commands, not files. Declaring them phony keeps a
+# stray file called "all" or "clean" from making the rule look up to date.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(CC) $(CFLAGS) -o $(TARGET) $(SRC)
+
+clean:
+	rm -f $(TARGET)
--- a/multiplication/OpenMP/matrix_mul
+++ b/multiplication/OpenMP/matrix_mul
--- a/multiplication/generate_matrix
+++ b/multiplication/generate_matrix
--- a/multiplication/generate_matrix.c
+++ b/multiplication/generate_matrix.c
@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 8192
+#define FILE_A "matrix_A.bin"
+#define FILE_B "matrix_B.bin"
+
+/*
+ * Writes an N x N matrix of pseudo-random doubles to `filename` as raw binary,
+ * one row at a time, so only a single row is ever held in memory.
+ *
+ * Every element is (rand() % 100) / 10.0, i.e. one of 0.0, 0.1, ..., 9.9.
+ * This function does not call srand().
+ *
+ * Exits with EXIT_FAILURE if the file cannot be opened, the row buffer cannot
+ * be allocated, or any write (including the final flush done by fclose) fails,
+ * so a truncated matrix file is never reported as a success.
+ */
+void generate_and_save_matrix(const char *filename) {
+    FILE *file = fopen(filename, "wb");
+    if (!file) {
+        perror("無法打開檔案");
+        exit(EXIT_FAILURE);
+    }
+
+    double *row = (double *)malloc(N * sizeof(double));
+    if (!row) {
+        perror("記憶體配置失敗");
+        fclose(file);
+        exit(EXIT_FAILURE);
+    }
+
+    for (size_t i = 0; i < N; i++) {
+        for (size_t j = 0; j < N; j++) {
+            row[j] = (double)(rand() % 100) / 10.0;
+        }
+        /* fwrite returns the number of complete elements written. */
+        if (fwrite(row, sizeof(double), N, file) != (size_t)N) {
+            perror("寫入檔案失敗");
+            free(row);
+            fclose(file);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    free(row);
+    /* Buffered data is flushed here, so a full disk can surface only now. */
+    if (fclose(file) != 0) {
+        perror("寫入檔案失敗");
+        exit(EXIT_FAILURE);
+    }
+}
+
+/* Entry point: writes both input matrices, then reports the two file names. */
+int main() {
+    const char *const outputs[] = { FILE_A, FILE_B };
+    const size_t count = sizeof(outputs) / sizeof(outputs[0]);
+
+    for (size_t idx = 0; idx < count; ++idx) {
+        generate_and_save_matrix(outputs[idx]);
+    }
+
+    printf("矩陣 A 和 B 已成功產生並存入 %s 和 %s\n", outputs[0], outputs[1]);
+    return 0;
+}
--- a/multiplication/makefile
+++ b/multiplication/makefile
@ -0,0 +1,12 @@
+# Builds the matrix generator (generate_matrix) from generate_matrix.c.
+CC = gcc
+CFLAGS = -fopenmp -O2
+TARGET = generate_matrix
+SRC = generate_matrix.c
+
+# `all` and `clean` name commands, not files. Declaring them phony keeps a
+# stray file called "all" or "clean" from making the rule look up to date.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(CC) $(CFLAGS) -o $(TARGET) $(SRC)
+
+clean:
+	rm -f $(TARGET)