diff --git a/part_1/image.cu b/part_1/image.cu index c013135a6e609b517ee9ec6b225c61b02d3745c2..3b1c01d518fbab5feb1c32e5295ddb58bae55c4c 100644 --- a/part_1/image.cu +++ b/part_1/image.cu @@ -31,6 +31,137 @@ const Image::PixelStruct Image::pixelPink = Image::PixelStruct(1,0,1); const Image::PixelStruct Image::pixelLiBlue = Image::PixelStruct(0,1,1); +//region CUDA Methods. + +/** + * Displays various properties of all CUDA devices found on machine. + **/ +void display_cuda_device_properties() { + // Check available CUDA devices. + cudaDeviceProp cuda_device_properties; + int cuda_device_count; + + cudaGetDeviceCount(&cuda_device_count); + + for (int index = 0; index < cuda_device_count; index++) { + cudaGetDeviceProperties(&cuda_device_properties, index); + + printf("\n\n\n\n"); + printf("==== General Information for CUDA Device %d ====\n", index); + printf("Name: %s\n", cuda_device_properties.name); + printf("Compute Capabilities: %d.%d\n", cuda_device_properties.major, cuda_device_properties.minor); + printf("Clock Rate: %d\n", cuda_device_properties.clockRate); + printf("\n\n"); + printf("==== Memory Information for CUDA Device %d ====\n", index); + printf("Total Global Memory: %ld\n", cuda_device_properties.totalGlobalMem); + printf("Total Constant Memory: %ld\n", cuda_device_properties.totalConstMem); + printf("Max Memory Pitch: %ld\n", cuda_device_properties.memPitch); + printf("Texture Alignment: %ld\n", cuda_device_properties.textureAlignment); + printf("\n\n"); + printf("==== Multiprocessing Information for CUDA Device %d ====\n", index); + printf("Multiprocessor Count: %d\n", cuda_device_properties.multiProcessorCount); + printf("Shared Memory Per MP: %ld\n", cuda_device_properties.sharedMemPerBlock); + printf("Registers Per MP: %d\n", cuda_device_properties.regsPerBlock); + printf("Threads in Warp: %d\n", cuda_device_properties.warpSize); + printf("Max Threads Per Block: %d\n", cuda_device_properties.maxThreadsPerBlock); + printf("Max Thread Dimensions: (%d, %d, %d)\n", + cuda_device_properties.maxThreadsDim[0], + cuda_device_properties.maxThreadsDim[1], + cuda_device_properties.maxThreadsDim[2] + ); + printf("Max Grid Dimensions: (%d, %d, %d)\n", + cuda_device_properties.maxGridSize[0], + cuda_device_properties.maxGridSize[1], + cuda_device_properties.maxGridSize[2] + ); + printf("\n\n\n\n"); + } +} + +/** + * Updates image so that every other pixel alternates between pure black and pure white. + * Purely for testing purposes. + */ +// __global__ void cuda_pixel_alternating(Image::PixelStruct* pixel_arr, int pixel_index) { +__global__ void cuda_alternating_pixels(unsigned long total_pixels, unsigned long chunk_size, int chunk_counter, Image::PixelStruct* pixel_arr) { + // Calculate current index. + unsigned long pixel_index = blockIdx.x * blockDim.x + threadIdx.x; + + // Adjust index for current chunk. + pixel_index += (chunk_size * chunk_counter); + + // Only proceed if GPU thread corresponds to a pixel in the image. Ignore all extra threads. + if (pixel_index <= total_pixels) { + + // Manipulate pixel data based on index. + if (pixel_index % 2 == 0) { + // Set to black. + pixel_arr[pixel_index].r = 0.f; + pixel_arr[pixel_index].g = 0.f; + pixel_arr[pixel_index].b = 0.f; + } else { + // Set to white. + pixel_arr[pixel_index].r = 255.f; + pixel_arr[pixel_index].g = 255.f; + pixel_arr[pixel_index].b = 255.f; + } + } +} + + +/** + * Updates image so that blocks of pixels alternate between pure black and pure white. + * Effectively, it's similar to cuda_alternating_pixels, except that all image sizes should look roughly the same. + * Purely for testing purposes. + */ +// __global__ void cuda_pixel_alternating(Image::PixelStruct* pixel_arr, int pixel_index) { +__global__ void cuda_alternating_blocks(unsigned long total_pixels, unsigned long chunk_size, int chunk_counter, int pixel_block_size, int image_width, int image_height, Image::PixelStruct* pixel_arr) { + // Calculate current index. + unsigned long pixel_index = blockIdx.x * blockDim.x + threadIdx.x; + + // Adjust index for current chunk. + pixel_index += (chunk_size * chunk_counter); + + // Only proceed if GPU thread corresponds to a pixel in the image. Ignore all extra threads. + if (pixel_index <= total_pixels) { + + // Calculate if falls under black or white pixel block. + int row = pixel_index / image_width; + int col = pixel_index % image_width; + int row_grouping = row / pixel_block_size; + int col_grouping = col / pixel_block_size; + float color; + if (row_grouping % 2 == 0) { + if ((col_grouping % 2) == 0) { + color = 255.f; + } else { + color = 0.f; + } + } else { + if ((col_grouping % 2) == 0) { + color = 0.f; + } else { + + color = 255.f; + } + } + + // Set pixel color. + pixel_arr[pixel_index].r = color; + pixel_arr[pixel_index].g = color; + pixel_arr[pixel_index].b = color; + + // int color = row * col * 2; + // printf("Width: %d Height: %d Index: %ld Row: %d Col: %d Color: %d\n", image_width, image_height, pixel_index, row, col, color); + // pixel_arr[pixel_index].r = static_cast<float>(color) / 255.f; + // pixel_arr[pixel_index].g = static_cast<float>(color) / 255.f; + // pixel_arr[pixel_index].b = static_cast<float>(color) / 255.f; + } +} + +//endregion CUDA Methods. + + //region Constructors. /** @@ -293,4 +424,76 @@ void Image::save() { } } + +/** + * Method to actually do work on image. + */ +void Image::compute() { + logger.debug("Image::compute():"); + logger.debug(""); + + // Get required CUDA device data. + // For now, lazily gets first CUDA device and assumes it's the desired device to use. + // TODO: Handle multiple CUDA devices. + cudaDeviceProp cuda_device_properties; + cudaGetDeviceProperties(&cuda_device_properties, 0); + int cuda_device_max_blocks = cuda_device_properties.maxThreadsDim[0]; + int cuda_device_max_threads = cuda_device_properties.maxThreadsPerBlock; + unsigned long cuda_chunk_size = cuda_device_max_blocks * cuda_device_max_threads; + + logger.info("CUDA Max Blocks: " + std::to_string(cuda_device_max_blocks)); + logger.info("CUDA Max Threads: " + std::to_string(cuda_device_max_threads)); + logger.info("CUDA Chunk Size: " + std::to_string(cuda_chunk_size)); + logger.info("CUDA Global Mem: " + std::to_string(cuda_device_properties.totalGlobalMem)); + + // Calculate some image data. + int pixel_struct_size = sizeof(PixelStruct); + unsigned long total_pixels = width * height; + unsigned long struct_byte_count = total_pixels * pixel_struct_size; + + logger.info("Total Image Size: " + std::to_string(total_pixels)); + logger.info("Pixel Struct Size: " + std::to_string(struct_byte_count)); + + // We handle image in chunks, in case we're dealing with large images with more pixels than CUDA device blocks/threads. + int total_image_chunks = (total_pixels / cuda_chunk_size) + 1; + logger.info("Total Image Chunks: " + std::to_string(total_image_chunks)); + + // Calculate desired blocks and threads. + int cuda_device_desired_block_count = total_pixels / cuda_device_max_threads; + if (cuda_device_desired_block_count <= 0) { + cuda_device_desired_block_count = 1; + } + // TODO: Handle if desired block count exceeds device maximum count. + + // Process image by iterating over all chunks. + for (int chunk_index = 0; chunk_index < total_image_chunks; chunk_index++) { + logger.info("Iterating over image chunk " + std::to_string(chunk_index)); + logger.info("Accounts for pixels " + std::to_string(cuda_chunk_size * chunk_index) + " through " + std::to_string((cuda_chunk_size * chunk_index) + cuda_chunk_size) ); + + // Allocate memory on GPU. + PixelStruct* gpu_pixel_arr; + cudaMalloc((void**) &gpu_pixel_arr, struct_byte_count); + cudaMemcpy(gpu_pixel_arr, pixel_arr, struct_byte_count, cudaMemcpyHostToDevice); + + // Run GPU kernel logic. + // cuda_alternating_pixels<<<cuda_device_desired_block_count, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + + int pixel_block_size = width / 10; + if (pixel_block_size <= 1) { + pixel_block_size = 2; + } + cuda_alternating_blocks<<<cuda_device_desired_block_count, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, pixel_block_size, width, height, gpu_pixel_arr); + + // Synchronize all GPU computations before iterating over next chunk. + cudaDeviceSynchronize(); + + // Retrieve results. + cudaMemcpy(pixel_arr, gpu_pixel_arr, struct_byte_count, cudaMemcpyDeviceToHost); + cudaFree(gpu_pixel_arr); + } + + logger.debug("Image::compute() finished."); + logger.debug(""); +} + //endregion Public Methods. diff --git a/part_1/image.h b/part_1/image.h index 65b5ba10724a12f51a51a77ae0969d23e2738f19..df24cba921ba7cfd0f4feffa3ff66552b6001e39 100644 --- a/part_1/image.h +++ b/part_1/image.h @@ -21,7 +21,7 @@ */ class Image { - private: + public: /** * Struct to act as a "container" of sorts for each individual rgb pixel. * An array of these creates our entire image. @@ -46,6 +46,7 @@ class Image { }; + private: // Private Variables. std::string input_path; std::string output_path; @@ -107,6 +108,12 @@ class Image { */ void save(); + + /** + * Method to actually do work on image. + */ + void compute(); + //endregion Public Methods. }; diff --git a/part_1/main.cu b/part_1/main.cu index 3fda4e5337e3c279c13fe1ab237492049c92ea49..d14d0e0a231809cee7706567eaa332819e6aefd5 100644 --- a/part_1/main.cu +++ b/part_1/main.cu @@ -161,8 +161,12 @@ void process_file(std::string path_str) { // Track time to process. time_t start_time = time(0); + // Import image data to class. Image image(path_str); image.display_properties(); + + // Manipulate and save image data. + image.compute(); image.save(); // Calculate time processing file.