From e48907084e310ee5284f195f2c3fe4de4f0965b7 Mon Sep 17 00:00:00 2001
From: Yorukot
Date: Fri, 28 Feb 2025 14:41:26 +0800
Subject: [PATCH] feat: update readme

---
 README.md                            | 183 +++++++++++++++++++++++++++
 matrix multiplication/OpenMP/main.cu |  49 -------
 sort algorithm/README.md             |   1 -
 3 files changed, 183 insertions(+), 50 deletions(-)
 create mode 100644 README.md
 delete mode 100644 matrix multiplication/OpenMP/main.cu
 delete mode 100644 sort algorithm/README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1454a3e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,183 @@
+# High-Performance Computing Repository
+
+This repository contains implementations of parallel sorting algorithms and matrix multiplication optimizations using various high-performance computing techniques, including CUDA and OpenMP.
+
+## Repository Structure
+
+```
+.
+├── sort algorithm/            # Sorting algorithm implementations
+│   ├── bitonic Sort/          # Bitonic sort implementation
+│   │   └── CUDA/              # CUDA-based bitonic sort
+│   ├── quicksort/             # Quicksort implementations
+│   │   └── OpenMP/            # OpenMP-based quicksort
+│   ├── dataset/               # Datasets with different sizes
+│   ├── generate_data          # Executable for data generation
+│   ├── generate_data.c        # Source for dataset generation
+│   └── Makefile               # Build configuration
+│
+└── matrix multiplication/     # Matrix multiplication implementations
+    ├── CUDA/                  # CUDA-based matrix multiplication
+    ├── OpenMP/                # OpenMP-based matrix multiplication
+    ├── dataset/               # Matrix datasets
+    ├── generate_matrix        # Executable for matrix generation
+    ├── generate_matrix.c      # Source for matrix generation
+    └── makefile               # Build configuration
+```
+
+## Sorting Algorithms
+
+The repository implements two sorting algorithms optimized for parallel execution:
+
+### Bitonic Sort (CUDA)
+
+Bitonic sort is a comparison-based sorting algorithm that is well-suited for parallel implementation. The CUDA implementation uses GPU parallelism to achieve high performance, especially for large datasets.
+
+Key features:
+- GPU-accelerated implementation using CUDA
+- Efficient for large datasets (up to hundreds of millions of elements)
+- In-place sorting to minimize memory overhead
+- Scales well with available GPU resources
+
+### Quicksort (OpenMP)
+
+The repository includes a parallel implementation of the classic quicksort algorithm using OpenMP.
+
+Key features:
+- CPU-based parallelism using OpenMP tasks
+- Hybrid approach with sequential sorting for small partitions
+- Configurable threshold (THRESHOLD) to switch between parallel and sequential execution
+- Good performance scaling with multiple CPU cores
+
+### Sorting Dataset Generation
+
+The repository includes a data generator (`generate_data.c`) that creates datasets of various sizes for testing the sorting algorithms. The generated datasets consist of random floating-point numbers.
+
+Features:
+- Supports a range of preset dataset sizes (listed below)
+- Creates the directory structure for organized data management
+- Generates reproducible random data
+
+Available dataset sizes (number of elements):
+- 1024 (1K)
+- 32768 (32K)
+- 1048576 (1M)
+- 16777216 (16M)
+- 67108864 (64M)
+- 134217728 (128M)
+- 268435456 (256M)
+- 402653184 (384M)
+
+## Matrix Multiplication
+
+The repository provides optimized implementations of matrix multiplication using parallel computing techniques:
+
+### CUDA Matrix Multiplication
+
+The CUDA implementation leverages GPU cores to perform matrix multiplication in parallel.
+
+Key features:
+- 2D thread organization for efficient computation
+- Each thread computes one element of the result matrix
+- Uses 16x16 thread blocks for good performance
+- Handles large matrix sizes (tested with 8192x8192)
+
+### OpenMP Matrix Multiplication
+
+The OpenMP implementation uses multiple CPU cores to parallelize the matrix multiplication operation.
+
+Key features:
+- Configurable number of threads (NUM_THREADS)
+- Uses OpenMP directives to parallelize the outermost loop
+- Simple and effective parallelization approach
+- Support for different matrix sizes
+
+### Matrix Data Generation
+
+The repository includes a tool (`generate_matrix.c`) for creating random matrices for testing purposes.
+
+Features:
+- Generates two random matrices (A and B)
+- Supports different matrix sizes (configurable via the N macro)
+- Saves matrices in binary format for efficient loading
+
+## Building and Running
+
+### Sorting Algorithms
+
+1. Generate the dataset:
+   ```
+   cd "sort algorithm"
+   make generate_data
+   ./generate_data [size]
+   ```
+   Where `[size]` is one of the supported dataset sizes listed above (e.g., 1024, 32768).
+
+2. Build and run the CUDA bitonic sort:
+   ```
+   cd "sort algorithm/bitonic Sort/CUDA"
+   make
+   ./bitonic_sort
+   ```
+
+3. Build and run the OpenMP quicksort:
+   ```
+   cd "sort algorithm/quicksort/OpenMP"
+   make
+   ./quicksort
+   ```
+
+### Matrix Multiplication
+
+1. Generate the matrices:
+   ```
+   cd "matrix multiplication"
+   make -f makefile
+   ./generate_matrix
+   ```
+
+2. Build and run the CUDA matrix multiplication:
+   ```
+   cd "matrix multiplication/CUDA"
+   make -f makefile
+   ./matrix_mul
+   ```
+
+3. Build and run the OpenMP matrix multiplication:
+   ```
+   cd "matrix multiplication/OpenMP"
+   make
+   ./matrix_mul
+   ```
+
+## Performance Considerations
+
+### Sorting Performance
+
+- The CUDA bitonic sort performs best for very large datasets where the GPU parallelism can be fully utilized.
+- The OpenMP quicksort performs well on multi-core CPUs and is more efficient for smaller datasets.
+- The THRESHOLD parameter in the OpenMP quicksort can be tuned based on the hardware to optimize performance.
+
+### Matrix Multiplication Performance
+
+- The CUDA implementation performs best for large matrices where the GPU parallelism can be fully utilized.
+- The OpenMP implementation works well on multi-core CPUs but may not scale as well as the GPU version for very large matrices.
+- The block size in the CUDA implementation (16x16) is chosen for good performance on most GPUs but can be tuned for specific hardware.
+
+## Requirements
+
+### Hardware
+- For CUDA implementations: NVIDIA GPU with CUDA support
+- For OpenMP implementations: Multi-core CPU
+
+### Software
+- CUDA Toolkit (for CUDA implementations)
+- GCC with OpenMP support (for OpenMP implementations)
+- Make
+
+## Notes
+
+- Matrix sizes and array lengths are generally defined as constants in the source files. Modify these constants to experiment with different sizes.
+- All programs read their input data from files in the corresponding dataset directory.
+- The CUDA implementations require a compatible NVIDIA GPU.
+- The OpenMP implementations will work on any multi-core CPU.
\ No newline at end of file
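The CUDA matrix multiplication kernel itself is not part of this patch. As a rough sketch of the approach the new README describes (one thread per element of C, 16x16 thread blocks, double-precision n x n matrices), a kernel and launch configuration might look like the following; the names `matmul_kernel` and `matmul`, the parameter `n`, and the exact signature are illustrative assumptions rather than the repository's actual code:

```cuda
#include <cuda_runtime.h>

// One thread computes one element of C (row-major, n x n, double precision).
// Illustrative sketch only; not the repository's actual kernel.
__global__ void matmul_kernel(const double *A, const double *B, double *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        double sum = 0.0;
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}

// Launch with 16x16 thread blocks, rounding the grid up to cover the whole matrix.
// A, B, and C are assumed to already reside in device memory.
void matmul(const double *d_A, const double *d_B, double *d_C, int n) {
    dim3 block(16, 16);
    dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
    matmul_kernel<<<grid, block>>>(d_A, d_B, d_C, n);
    cudaDeviceSynchronize();
}
```

For the 8192x8192 matrices mentioned in the README, this launch configuration corresponds to a 512x512 grid of 16x16 blocks.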
diff --git a/matrix multiplication/OpenMP/main.cu b/matrix multiplication/OpenMP/main.cu
deleted file mode 100644
index e20d89f..0000000
--- a/matrix multiplication/OpenMP/main.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <omp.h>
-
-#define N 1024
-#define NUM_THREADS 8  // Set desired number of threads
-
-void load_matrix(const char *filename, double *matrix) {
-    FILE *file = fopen(filename, "rb");
-    if (!file) {
-        perror("Failed to read file");
-        exit(EXIT_FAILURE);
-    }
-    fread(matrix, sizeof(double), N * N, file);
-    fclose(file);
-}
-
-int main() {
-    char matrix_a_path[256], matrix_b_path[256];
-    sprintf(matrix_a_path, "../dataset/%d/matrix_A.bin", N);
-    sprintf(matrix_b_path, "../dataset/%d/matrix_B.bin", N);
-
-    double *A = (double *)malloc(N * N * sizeof(double));
-    double *B = (double *)malloc(N * N * sizeof(double));
-    double *C = (double *)calloc(N * N, sizeof(double));
-
-    if (!A || !B || !C) {
-        perror("Memory allocation failed");
-        exit(EXIT_FAILURE);
-    }
-
-    load_matrix(matrix_a_path, A);
-    load_matrix(matrix_b_path, B);
-
-    omp_set_num_threads(NUM_THREADS);  // Set thread limit
-    #pragma omp parallel for
-    for (int i = 0; i < N; i++) {
-        for (int j = 0; j < N; j++) {
-            for (int k = 0; k < N; k++) {
-                C[i * N + j] += A[i * N + k] * B[k * N + j];
-            }
-        }
-    }
-
-    free(A);
-    free(B);
-    free(C);
-    return 0;
-}
diff --git a/sort algorithm/README.md b/sort algorithm/README.md
deleted file mode 100644
index 72eccfe..0000000
--- a/sort algorithm/README.md
+++ /dev/null
@@ -1 +0,0 @@
-1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216
\ No newline at end of file
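The CUDA bitonic sort is likewise only described, not shown, in this patch. A minimal sketch of the standard in-place pattern the README refers to, a compare-exchange step kernel driven by a host-side loop over stage and substage sizes, assuming a power-of-two element count, could look like this; the kernel name, the `float` element type, and the launch parameters are assumptions, not the repository's code:

```cuda
#include <cuda_runtime.h>

// One compare-exchange step of bitonic sort; each thread handles one pair (i, i^j).
// Illustrative sketch only; not the repository's actual kernel.
__global__ void bitonic_step(float *data, int n, int j, int k) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int partner = i ^ j;
    if (i < (unsigned int)n && partner > i) {
        // (i & k) == 0 selects ascending order for this subsequence, otherwise descending.
        bool ascending = ((i & k) == 0);
        if ((ascending && data[i] > data[partner]) ||
            (!ascending && data[i] < data[partner])) {
            float tmp = data[i];
            data[i] = data[partner];
            data[partner] = tmp;
        }
    }
}

// Host-side driver: n must be a power of two and data must already be on the device.
void bitonic_sort(float *d_data, int n) {
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    for (int k = 2; k <= n; k <<= 1) {          // size of the bitonic sequences being merged
        for (int j = k >> 1; j > 0; j >>= 1) {  // compare-exchange distance
            bitonic_step<<<blocks, threads>>>(d_data, n, j, k);
        }
    }
    cudaDeviceSynchronize();
}
```

The array stays in GPU memory across all steps and is swapped element-by-element, which is consistent with the README's in-place, low-memory-overhead description.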