diff --git a/part_1/image.cu b/part_1/image.cu index 87afa26c8f26bbef367407422e15420a415fa4d7..9f0eb580c0a6fcaf2cf5106e5855d5bd20a4c68e 100644 --- a/part_1/image.cu +++ b/part_1/image.cu @@ -273,6 +273,148 @@ __global__ void cuda_increase_blue(unsigned long total_pixels, unsigned long chu } } + +/** + * Uses convolution to blur image. + */ +__global__ void cuda_convolution_blur( + unsigned long total_pixels, unsigned long chunk_size, + int chunk_counter, int image_width, int image_height, int mask_width, int mask_height, + Image::PixelStruct* pixel_arr_orig, Image::PixelStruct* pixel_arr, Image::PixelStruct* mask_arr +) { + // Calculate current index. + unsigned long pixel_index = blockIdx.x * blockDim.x + threadIdx.x; + + // Adjust index for current chunk. + pixel_index += (chunk_size * chunk_counter); + + // Only proceed if GPU thread corresponds to a pixel in the image. Ignore all extra threads. + if (pixel_index <= total_pixels) { + + // Calculate pixel location data. + int image_row = (pixel_index / image_width) % image_height; + int image_col = pixel_index % image_width; + int image_mid_row = image_height / 2; + int image_mid_col = image_width / 2; + + // Calculate mask location data. + int mask_mid_row = mask_height / 2; + int mask_mid_col = mask_width / 2; + + // Calculate pixel locations based on mask values. + int col_far_left = image_mid_col - mask_mid_col; + int col_far_right = image_mid_col + mask_mid_col; + int row_far_top = image_mid_row - mask_mid_row; + int row_far_bot = image_mid_row + mask_mid_row; + + // For now, only compute on pixel in center of image. + if ((image_row == image_mid_row) && (image_col == image_mid_col)) { + // printf("\n\n"); + // printf("total_pixels: %ld image_width: %d image_height: %d\n", total_pixels, image_width, image_height); + // printf("Middle pixel found: %ld image_col: %d image_row: %d\n", pixel_index, image_col, image_row); + // printf("image_middle_col: %d image_middle_row: %d mask_middle_col: %d mask_middle_row: %d\n", image_mid_col, image_mid_row, mask_mid_col, mask_mid_row); + + // printf("\n"); + // printf("Associated mask indexes:\n"); + // printf("Center: [%d,%d]\n", image_mid_col, image_mid_row); + // printf("CenterLeft: [%d,%d]\n", col_far_left, image_mid_row); + // printf("CenterRight: [%d,%d]\n", col_far_right, image_mid_row); + // printf("CenterTop: [%d,%d]\n", image_mid_col, row_far_top); + // printf("CenterBottom: [%d,%d]\n", image_mid_col, row_far_bot); + // printf("\n\n"); + + // // Display grid of mask values. + // for (int row_index = row_far_top; row_index <= row_far_bot; row_index++) { + // for (int col_index = col_far_left; col_index <= col_far_right; col_index++) { + // printf(" [%d,%d] ", col_index, row_index); + // } + // printf("\n"); + // } + // printf("\n\n"); + + // // Display grid of mask values. + // for (int row_index = 0; row_index < image_height; row_index++) { + // for (int col_index = 0; col_index < image_width; col_index++) { + // int a = col_index; + // int b = row_index; + // int c = a + (b * image_width); + // printf(" [%d,%d] (%d) ", a, b, c); + // } + // printf("\n"); + // } + // printf("\n\n"); + + // // Display grid of mask values. + // for (int row_index = row_far_top; row_index <= row_far_bot; row_index++) { + // for (int col_index = col_far_left; col_index <= col_far_right; col_index++) { + // int a = col_index; + // int b = row_index; + // int c = a + (b * image_width); + // printf(" [%d,%d] (%d) ", a, b, c); + // } + // printf("\n"); + // } + // printf("\n\n"); + + // pixel_arr[pixel_index].r = 1.f; + // pixel_arr[pixel_index].g = 0.f; + // pixel_arr[pixel_index].b = 0.f; + + // Overlay mask onto current pixel, and iterate through all associated pixels. + float convolution_total_r = 0.f; + float convolution_total_g = 0.f; + float convolution_total_b = 0.f; + int convolution_counter = 0; + int mask_index = 0; + for (int row_index = row_far_top; row_index <= row_far_bot; row_index++) { + for (int col_index = col_far_left; col_index <= col_far_right; col_index++) { + + // Double check that mask index is within image bounds. Ignore otherwise. + if ( ( (col_index >= 0) && (col_index < image_width) ) && ( (row_index >= 0) && (row_index < image_height) ) ) { + int arr_index = col_index + (row_index * image_width); + + // int a = col_index; + // int b = row_index; + // int c = a + (b * image_width); + // printf(" [%d,%d] (%d) (%f,%f,%f)", a, b, c, pixel_arr[mask_index].r, pixel_arr[mask_index].g, pixel_arr[mask_index].b); + + convolution_total_r += pixel_arr_orig[arr_index].r * mask_arr[mask_index].r; + convolution_total_g += pixel_arr_orig[arr_index].g * mask_arr[mask_index].g; + convolution_total_b += pixel_arr_orig[arr_index].b * mask_arr[mask_index].b; + convolution_counter += 1; + + // float red = pixel_arr_orig[arr_index].r * mask_arr[mask_index].r; + // float green = pixel_arr_orig[arr_index].g * mask_arr[mask_index].g; + // float blue = pixel_arr_orig[arr_index].b * mask_arr[mask_index].b; + // pixel_arr[arr_index].r = red; + // pixel_arr[arr_index].g = green; + // pixel_arr[arr_index].b = blue; + } + } + mask_index += 1; + // printf("\n"); + } + + // Average out values and apply to pixel index. + if (convolution_total_r > 0.f) { + pixel_arr[pixel_index].r = (convolution_total_r / convolution_counter); + } else { + pixel_arr[pixel_index].r = 0.f; + } + if (convolution_total_g > 0.f) { + pixel_arr[pixel_index].g = (convolution_total_g / convolution_counter); + } else { + pixel_arr[pixel_index].g = 0.f; + } + if (convolution_total_b > 0.f) { + pixel_arr[pixel_index].b = (convolution_total_b / convolution_counter); + } else { + pixel_arr[pixel_index].b = 0.f; + } + } + } +} + //endregion CUDA Methods. @@ -444,7 +586,7 @@ void Image::import_image_file() { // Binary RBG format. // Read each pixel one by one and convert RGB channel bytes to floats. unsigned char pixel[3]; - for (int index = 0; index < width * height; ++index) { + for (int index = 0; index < (width * height); ++index) { input_file.read(reinterpret_cast<char *>(pixel), 3); pixel_arr[index].r = pixel[0] / 255.f; pixel_arr[index].g = pixel[1] / 255.f; @@ -455,7 +597,7 @@ void Image::import_image_file() { // Read each pixel one by one and convert bytes to floats. unsigned char* pixel; pixel = (unsigned char*)calloc(1, sizeof(unsigned char)); - for (int index = 0; index < width * height; ++index) { + for (int index = 0; index < (width * height); ++index) { input_file.read(reinterpret_cast<char *>(pixel), 1); pixel_arr[index].r = *pixel / 255.f; pixel_arr[index].g = *pixel / 255.f; @@ -562,49 +704,112 @@ void Image::compute() { // Calculate some image data. int pixel_struct_size = sizeof(PixelStruct); - unsigned long total_pixels = width * height; - unsigned long struct_byte_count = total_pixels * pixel_struct_size; + unsigned long image_total_pixels = width * height; + unsigned long pixel_struct_byte_count = image_total_pixels * pixel_struct_size; - logger.info("Total Image Size: " + std::to_string(total_pixels)); - logger.info("Pixel Struct Size: " + std::to_string(struct_byte_count)); + logger.info("Total Image Size: " + std::to_string(image_total_pixels)); + logger.info("Pixel Struct Size: " + std::to_string(pixel_struct_byte_count)); // We handle image in chunks, in case we're dealing with large images with more pixels than CUDA device blocks/threads. - int total_image_chunks = (total_pixels / cuda_chunk_size) + 1; + int total_image_chunks = (image_total_pixels / cuda_chunk_size) + 1; logger.info("Total Image Chunks: " + std::to_string(total_image_chunks)); logger.info("Desired CUDA block count: " + std::to_string(cuda_device_max_blocks)); logger.info("Desired CUDA thread count: " + std::to_string(cuda_device_max_threads)); + // Generate convolution mask dimensions. Should be either 10% of image size or 3 pixels large, whichever is greater. + int mask_width = width / 10; + int mask_height = height / 10; + if ((mask_width % 2) == 0) { + mask_width -= 1; + } + if ((mask_height % 2) == 0) { + mask_height -= 1; + } + if (mask_width < 3) { + mask_width = 3; + } + if (mask_height < 3) { + mask_height = 3; + } + int mask_middle_col = mask_width / 2; + int mask_middle_row = mask_height / 2; + + // Calculate additional mask initialization values. + // Found via much struggling and trial + error. + float mask_col_adj = 1.f / (mask_middle_col + 1); + float mask_row_adj = 1.f / (mask_middle_row + 1); + printf("mask_middle_col: %d mask_middle_row: %d\n", mask_middle_col, mask_middle_row); + printf("mask_col_adj: %f mask_row_adj: %f\n", mask_col_adj, mask_row_adj); + unsigned long mask_total_pixels = mask_width * mask_height; + unsigned long mask_struct_byte_count = mask_total_pixels * pixel_struct_size; + + // Initialize mask and set weights. + PixelStruct* mask_arr = new PixelStruct[mask_total_pixels]; + for (int mask_row_index = 0; mask_row_index < mask_height; mask_row_index++) { + for (int mask_col_index = 0; mask_col_index < mask_width; mask_col_index++) { + // Get current overall mask index, based on row and col values. + int mask_index = mask_col_index + (mask_row_index * mask_width); + + // Calculate individual weights based on current x and y axis location of mask index, using distance from center point. + float mask_col_weight = (fabs(mask_middle_col - fabs(mask_middle_col - mask_col_index)) + 1) * mask_col_adj; + float mask_row_weight = (fabs(mask_middle_row - fabs(mask_middle_row - mask_row_index)) + 1) * mask_row_adj; + + // Calculate overall index weight, based on combination of individual dimension weights. + float index_weight = mask_col_weight * mask_row_weight; + + // printf(" [%d, %d] (%d) (%f,%f) ", mask_col_index, mask_row_index, mask_index, mask_col_weight, mask_row_weight); + printf(" [%d, %d] (%d) (%f) ", mask_col_index, mask_row_index, mask_index, index_weight); + + // Finally, set mask index values. + mask_arr[mask_index].r = index_weight; + mask_arr[mask_index].g = index_weight; + mask_arr[mask_index].b = index_weight; + } + printf("\n"); + } + + // Allocate memory on GPU. + PixelStruct* gpu_pixel_arr; + PixelStruct* gpu_pixel_arr_orig; + PixelStruct* gpu_mask_arr; + cudaMalloc((void**) &gpu_pixel_arr, pixel_struct_byte_count); + cudaMalloc((void**) &gpu_pixel_arr_orig, pixel_struct_byte_count); + cudaMalloc((void**) &gpu_mask_arr, mask_struct_byte_count); + cudaMemcpy(gpu_pixel_arr, pixel_arr, pixel_struct_byte_count, cudaMemcpyHostToDevice); + cudaMemcpy(gpu_pixel_arr_orig, pixel_arr, pixel_struct_byte_count, cudaMemcpyHostToDevice); + cudaMemcpy(gpu_mask_arr, mask_arr, mask_struct_byte_count, cudaMemcpyHostToDevice); + // Process image by iterating over all chunks. for (int chunk_index = 0; chunk_index < total_image_chunks; chunk_index++) { logger.info("Iterating over image chunk " + std::to_string(chunk_index)); logger.info("Accounts for pixels " + std::to_string(cuda_chunk_size * chunk_index) + " through " + std::to_string((cuda_chunk_size * chunk_index) + cuda_chunk_size) ); - // Allocate memory on GPU. - PixelStruct* gpu_pixel_arr; - cudaMalloc((void**) &gpu_pixel_arr, struct_byte_count); - cudaMemcpy(gpu_pixel_arr, pixel_arr, struct_byte_count, cudaMemcpyHostToDevice); - - // Run GPU kernel logic. - // cuda_alternating_pixels<<<cuda_device_max_blocks, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + // Run GPU test logic to set alternating pixels to black/white. + // cuda_alternating_pixels<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + // Run GPU test logic to create black/white checkerboard pattern. // int pixel_block_size = width / 10; // if (pixel_block_size <= 1) { // pixel_block_size = 2; // } - // cuda_alternating_blocks<<<cuda_device_max_blocks, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, pixel_block_size, width, height, gpu_pixel_arr); + // cuda_alternating_blocks<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, pixel_block_size, width, height, gpu_pixel_arr); + + // Run GPU test logic to increase/decrease r/g/b colors in image.. + // cuda_increase_red<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + // cuda_increase_green<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + // cuda_increase_blue<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); - // cuda_increase_red<<<cuda_device_max_blocks, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); - // cuda_increase_green<<<cuda_device_max_blocks, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); - cuda_increase_blue<<<cuda_device_max_blocks, cuda_device_max_threads>>>(total_pixels, cuda_chunk_size, chunk_index, gpu_pixel_arr); + // Run GPU logic to use convolution to create blur effect. + cuda_convolution_blur<<<cuda_device_max_blocks, cuda_device_max_threads>>>(image_total_pixels, cuda_chunk_size, chunk_index, width, height, mask_width, mask_height, gpu_pixel_arr_orig, gpu_pixel_arr, gpu_mask_arr); // Synchronize all GPU computations before iterating over next chunk. cudaDeviceSynchronize(); - - // Retrieve results. - cudaMemcpy(pixel_arr, gpu_pixel_arr, struct_byte_count, cudaMemcpyDeviceToHost); - cudaFree(gpu_pixel_arr); } + // Retrieve results. + cudaMemcpy(pixel_arr, gpu_pixel_arr, pixel_struct_byte_count, cudaMemcpyDeviceToHost); + cudaFree(gpu_pixel_arr); + logger.debug("Image::compute() finished."); logger.debug(""); }