diff --git a/src/kernel.cu b/src/kernel.cu index 2a75e3a..bc47e61 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -20,7 +20,11 @@ __constant__ int gaussian[9]; __constant__ int sobel_x[9]; __constant__ int sobel_y[9]; -__global__ void hysteresisKernel(unsigned char* deviceInput, unsigned char* deviceOutput, int width, int height) + + +// This method is the kernel for the final step of hysteresis thresholding +__global__ void hysteresisKernel(unsigned char* deviceInput, unsigned char* + deviceOutput, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -56,6 +60,10 @@ __global__ void hysteresisKernel(unsigned char* deviceInput, unsigned char* devi } } + + +// This method is responsible for handling memory and calling the kernel for +// the final step of hysteresis thresholding void hysteresisCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { // Allocate memory on device for input and output @@ -73,9 +81,8 @@ void hysteresisCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - hysteresisKernel << > > (deviceInput, deviceOutput, hostInput.cols, hostInput.rows); - - + hysteresisKernel << > > (deviceInput, + deviceOutput, hostInput.cols, hostInput.rows); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -84,6 +91,10 @@ void hysteresisCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) cudaFree(deviceOutput); } + + +// This method is our CPU powered BFS implementation of the second portion of +// hysteresis thresholding void hysteresisCPU(cv::Mat& hostInput, cv::Mat& hostOutput) { std::queue> strongEdges; @@ -109,21 +120,35 @@ void hysteresisCPU(cv::Mat& hostInput, cv::Mat& hostOutput) { strongEdges.pop(); hostOutput.at(row, col) = 255; - //examine all neighbors + // examine all neighbors, after checking if they are contained within + // the image boundaries std::vector> neighbors; - if (col - 1 > 0 && row - 1 > 0) { neighbors.push_back(std::pair(row - 1, col - 1)); } - if (col - 1 > 0) { neighbors.push_back(std::pair(row, col - 1)); } - if (col - 1 > 0 && row + 1 < hostInput.rows) { neighbors.push_back(std::pair(row + 1, col - 1)); } - if (row - 1 > 0) { neighbors.push_back(std::pair(row - 1, col)); } - if (row + 1 < hostInput.rows) { neighbors.push_back(std::pair(row + 1, col)); } - if (col + 1 < hostInput.cols && row - 1 > 0) { neighbors.push_back(std::pair(row - 1, col + 1)); } - if (col + 1 < hostInput.cols) { neighbors.push_back(std::pair(row, col + 1)); } - if (col + 1 < hostInput.cols && row + 1 < hostInput.rows) { neighbors.push_back(std::pair(row + 1, col + 1)); } - + if (col - 1 > 0 && row - 1 > 0) { neighbors.push_back + (std::pair(row - 1, col - 1)); } + if (col - 1 > 0) { neighbors.push_back + (std::pair(row, col - 1)); } + if (col - 1 > 0 && row + 1 < hostInput.rows) { neighbors.push_back + (std::pair(row + 1, col - 1)); } + if (row - 1 > 0) { neighbors.push_back(std::pair + (row - 1, col)); } + if (row + 1 < hostInput.rows) { neighbors.push_back + (std::pair(row + 1, col)); } + if (col + 1 < hostInput.cols && row - 1 > 0) { neighbors.push_back + (std::pair(row - 1, col + 1)); } + if (col + 1 < hostInput.cols) { neighbors.push_back(std::pair + (row, col + 1)); } + if (col + 1 < hostInput.cols && row + 1 < hostInput.rows) { + neighbors.push_back(std::pair(row + 1, col + 1)); } + + // For each neighbor, if it is a weak edge, make it a strong edge and + // queue it for (int i = 0; i < neighbors.size(); i++) { - if (hostInput.at(neighbors[i].first, neighbors[i].second) == 128) { - strongEdges.push(std::pair(neighbors[i].first, neighbors[i].second)); - hostInput.at(neighbors[i].first, neighbors[i].second) = 255; + if (hostInput.at(neighbors[i].first, neighbors[i].second) == + 128) { + strongEdges.push(std::pair(neighbors[i].first, + neighbors[i].second)); + hostInput.at(neighbors[i].first, neighbors[i].second) = + 255; } } @@ -131,7 +156,12 @@ void hysteresisCPU(cv::Mat& hostInput, cv::Mat& hostOutput) { } -__global__ void thresholdingKernel(unsigned char* deviceInput, unsigned char* deviceOutput, int width, int height) { + + +// This method is the kernel for the first step of hysteresis thresholding +// populating the output image with only non, weak, and strong edge indicators +__global__ void thresholdingKernel(unsigned char* deviceInput, + unsigned char* deviceOutput, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -159,6 +189,10 @@ __global__ void thresholdingKernel(unsigned char* deviceInput, unsigned char* de } + + +// This method is responsible for handling memory and calling the kernel that +// does the first portion of hysteresis thresholding void thresholdingCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { // Allocate memory on device for input and output unsigned char* deviceInput; @@ -174,9 +208,8 @@ void thresholdingCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - thresholdingKernel << > > (deviceInput, deviceOutput, hostInput.cols, hostInput.rows); - - + thresholdingKernel << > > (deviceInput, + deviceOutput, hostInput.cols, hostInput.rows); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -186,7 +219,10 @@ void thresholdingCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { } -__global__ void nonMaximaSuppressionKernel(unsigned char* deviceInput, unsigned char* deviceOutput, float* angles, int width, int height) + +// This method is the kernel that handles non-maxima suppression +__global__ void nonMaximaSuppressionKernel(unsigned char* deviceInput, + unsigned char* deviceOutput, float* angles, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -235,7 +271,12 @@ __global__ void nonMaximaSuppressionKernel(unsigned char* deviceInput, unsigned } } -void nonMaximaSuppressionCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, float* hostAngles) + + +// This method is responsible for memory managment and calling the kernel +// that performs the non-maxima suppression on a frame +void nonMaximaSuppressionCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, + float* hostAngles) { // Allocate memory on device for input and output unsigned char* deviceInput; @@ -255,9 +296,11 @@ void nonMaximaSuppressionCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, flo cudaMemcpy(deviceAngles, hostAngles, anglesBytes, cudaMemcpyHostToDevice); // Call the kernel to apply non-maxima suppression - const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows / BLOCK_SIZE), 1); + const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), + ceil(hostInput.rows / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - nonMaximaSuppressionKernel << > >(deviceInput, deviceOutput, deviceAngles, hostInput.cols, hostInput.rows); + nonMaximaSuppressionKernel << > >(deviceInput, + deviceOutput, deviceAngles, hostInput.cols, hostInput.rows); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -267,7 +310,11 @@ void nonMaximaSuppressionCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, flo cudaFree(deviceAngles); } -__global__ void sobelKernel(unsigned char* deviceInput, unsigned char* deviceOutput, float* deviceAngles, int width, int height) + + +// This kernel creates an intensity gradient using the sobel operator +__global__ void sobelKernel(unsigned char* deviceInput, + unsigned char* deviceOutput, float* deviceAngles, int width, int height) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; @@ -291,10 +338,15 @@ __global__ void sobelKernel(unsigned char* deviceInput, unsigned char* deviceOut (sobel_y[7] * deviceInput[(y + 1) * width + ( x )]) + (sobel_y[8] * deviceInput[(y + 1) * width + (x + 1)]); - deviceOutput[y * width + x] = static_cast(sqrt((float)(gx * gx) + (gy * gy))); + deviceOutput[y * width + x] = + static_cast(sqrt((float)(gx * gx) + (gy * gy))); deviceAngles[y * width + x] = atan((float) (gy / gx)); } + + +// This method is responsible for managing memory and calling the kernel +// that creates an intensity gradient using the sobel operator void sobelCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, float* hostAngles) { // Allocate memory on device for input and output @@ -319,9 +371,11 @@ void sobelCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, float* hostAngles) cudaMemcpyToSymbol(sobel_y, h_sobel_y, 9 * sizeof(int)); // Call the kernel to apply the Sobel filter - const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows / BLOCK_SIZE), 1); + const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows + / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - sobelKernel << > > (deviceInput, deviceOutput, deviceAngles, hostInput.cols, hostInput.rows); + sobelKernel << > > (deviceInput, deviceOutput, + deviceAngles, hostInput.cols, hostInput.rows); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -333,7 +387,11 @@ void sobelCuda(const cv::Mat& hostInput, cv::Mat& hostOutput, float* hostAngles) } -__global__ void gaussianKernel(unsigned char* deviceInput, unsigned char* deviceOutput, int width, int height) + +// This kernel performs a gaussian blur on the image deviceInput and outputs it +// to deviceOutput +__global__ void gaussianKernel(unsigned char* deviceInput, unsigned char* + deviceOutput, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -356,6 +414,9 @@ __global__ void gaussianKernel(unsigned char* deviceInput, unsigned char* device } + +// This method is responsible for managing memory and calling the kernel that +// performs gaussian blur void gaussianCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { // Allocate memory on device for input and output @@ -373,9 +434,11 @@ void gaussianCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) cudaMemcpyToSymbol(gaussian, hostGaussian, 9 * sizeof(int)); // Call the kernel to convert the image to grayscale - const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows / BLOCK_SIZE), 1); + const dim3 numBlocks(ceil(hostInput.cols / BLOCK_SIZE), ceil(hostInput.rows + / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - gaussianKernel << < numBlocks, threadsPerBlock >> > (deviceInput, deviceOutput, hostInput.cols, hostInput.rows); + gaussianKernel << < numBlocks, threadsPerBlock >> > (deviceInput, + deviceOutput, hostInput.cols, hostInput.rows); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -385,8 +448,10 @@ void gaussianCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) } -__global__ void grayscaleKernel(unsigned char* rgbInput, unsigned char* grayOutput, - int width, int height, int colorWidthStep, int grayWidthStep) +// This kernel is responsible for converting the color frame into a greyscale +// image, and outputing it to grayOutput +__global__ void grayscaleKernel(unsigned char* rgbInput, unsigned char* + grayOutput, int width, int height, int colorWidthStep, int grayWidthStep) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -405,6 +470,10 @@ __global__ void grayscaleKernel(unsigned char* rgbInput, unsigned char* grayOutp } } + + +// This method is responsible for managing memory and calling the kernel that +// performs the coversion from color to a greyscale image void grayscaleCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) { // Allocate memory on device for input and output @@ -420,8 +489,10 @@ void grayscaleCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) // Call the kernel to convert the image to grayscale const dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1); - const dim3 gridSize((hostInput.cols + blockSize.x - 1) / blockSize.x, (hostInput.rows + blockSize.y - 1) / blockSize.y, 1); - grayscaleKernel << > > (deviceInput, deviceOutput, hostInput.cols, hostInput.rows, hostInput.step, hostOutput.step); + const dim3 gridSize((hostInput.cols + blockSize.x - 1) / blockSize.x, + (hostInput.rows + blockSize.y - 1) / blockSize.y, 1); + grayscaleKernel << > > (deviceInput, deviceOutput, + hostInput.cols, hostInput.rows, hostInput.step, hostOutput.step); // Copy memory back to host after kernel is complete cudaDeviceSynchronize(); @@ -430,10 +501,14 @@ void grayscaleCuda(const cv::Mat& hostInput, cv::Mat& hostOutput) cudaFree(deviceOutput); } -// This function takes in a path to a video file (which are passed in as command line args to main) -// as the first parameter and outputs each extracted frame to a vector of Mat items which is passed -// in as the second parameter to the function. -void extractFrames(const std::string& videoFilePath, std::vector& framesOut) + + +// This function takes in a path to a video file (which are passed in as command +// line args to main) as the first parameter and outputs each extracted frame +// to a vector of Mat items which is passed in as the second parameter to the +// function. +void extractFrames(const std::string& videoFilePath, std::vector& + framesOut) { try { @@ -443,7 +518,8 @@ void extractFrames(const std::string& videoFilePath, std::vector& frame std::cerr << "Unable to open video file!" << std::endl; return; } - for (int frameNum = 0; frameNum < cap.get(cv::CAP_PROP_FRAME_COUNT); frameNum++) + for (int frameNum = 0; frameNum < cap.get(cv::CAP_PROP_FRAME_COUNT); + frameNum++) { cv::Mat frame; cap >> frame; @@ -458,6 +534,7 @@ void extractFrames(const std::string& videoFilePath, std::vector& frame } + // This function accepts a single frame and detects edges in it using opencv // Canny(). It returns the edge detected image. cv::Mat opencvCanny(const cv::Mat& frame) { @@ -475,6 +552,7 @@ cv::Mat opencvCanny(const cv::Mat& frame) { } + // This function accepts a single frame and performs a hough transform on it // Returns a vector of the lines that were detected void houghTransform(const cv::Mat& frame, std::vector& houghLines) { @@ -483,6 +561,8 @@ void houghTransform(const cv::Mat& frame, std::vector& houghLines) { return; } + + // This method determines the two best candidates out of all the lines picked // up by the hough transform for the left and right lane, then draws them // on the original color frame image @@ -540,8 +620,6 @@ cv::Mat drawLines(const cv::Mat& frame, std::vector& houghLines) { lanes.push_back(houghLines[rightLaneCandidate]); // Draw the lines - // Code for drawing lines on an image pulled from houghlines.cpp in opencv - // tutorials and adapted for our purpose for (size_t i = 0; i < lanes.size(); i++) { // elements of this polar coordinate line @@ -568,144 +646,187 @@ cv::Mat drawLines(const cv::Mat& frame, std::vector& houghLines) { // testing line to show theta values of final lines //std::cerr << "Theta = " << theta << std::endl; - } return output; } -/* -void grayscaleOptimized(unsigned char* deviceInput, unsigned char* deviceOutput, int width, int height) -{ - const dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1); - const dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1); - grayscaleKernel << > > (deviceInput, deviceOutput, hostInput.cols, hostInput.rows, hostInput.step, hostOutput.step); -}*/ -cv::Mat gpuOptimized(const cv::Mat &frame) + +// This version of our GPU canny edge detetion path is an attempt at optimizing +// the process by keeping images in device memory and reducing copies from +// host to device and back. +cv::Mat gpuOptimized(const cv::Mat &frame, bool debug) { int width = frame.cols; int height = frame.rows; int cols = frame.cols; int rows = frame.rows; - - //int rgb_bytes = frame.rows * frame.step; - //int rgb_bytes = rows * cols * sizeof(unsigned char) * CHANNELS; + cv::Mat output = cv::Mat(height, width, CV_8UC1); int rgb_bytes = frame.step * frame.rows; - //int rgb_bytes = rows * cols * sizeof(unsigned char) * CHANNELS; int bytes = rows * cols * sizeof(unsigned char); - cv::Mat output = cv::Mat(height, width, CV_8UC1); - // Allocate memory on device for input and output - unsigned char* deviceInput; + /******************************************************************* + * RGB TO GRAYSCALE CONVERSION + *******************************************************************/ + unsigned char* grayscaleInput; unsigned char* grayscaleOutput; - cudaMalloc(&deviceInput, rows * cols * sizeof(unsigned char) * CHANNELS); + cudaMalloc(&grayscaleInput, rows * cols * sizeof(unsigned char) * CHANNELS); cudaMalloc(&grayscaleOutput, bytes); - - cv::imshow("Frame", frame); - cv::waitKey(0); - //cudaDeviceSynchronize(); - - // Copy host memory to device - cudaMemcpy(deviceInput, frame.ptr(), rgb_bytes, cudaMemcpyHostToDevice); - /*cv::Mat copyback = cv::Mat(height, width, CV_8UC1); - cudaMemcpy(copyback.ptr(), grayscaleOutput, bytes, cudaMemcpyDeviceToHost); - cv::imshow("Grayscale", copyback); - cv::waitKey(0);*/ - - // Set up block configuration for RGB to grayscale + cudaMemcpy(grayscaleInput, frame.ptr(), rgb_bytes, cudaMemcpyHostToDevice); const dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1); - const dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1); - - grayscaleKernel << > > (deviceInput, grayscaleOutput, width, height, frame.step, output.step); - - cv::Mat grayscale = cv::Mat(height, width, CV_8UC1); - cudaMemcpy(grayscale.ptr(), grayscaleOutput, bytes, cudaMemcpyDeviceToHost); - - cv::imshow("OPTIMIZED Grayscale", grayscale); - cv::waitKey(0); + const dim3 gridSize((width + blockSize.x - 1) / blockSize.x, + (height + blockSize.y - 1) / blockSize.y, 1); + grayscaleKernel << > > (grayscaleInput, + grayscaleOutput, width, height, frame.step, output.step); cudaDeviceSynchronize(); + if (debug) + { + cv::Mat grayscale = cv::Mat(height, width, CV_8UC1); + cudaMemcpy(grayscale.ptr(), grayscaleOutput, bytes, cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED Grayscale", grayscale); + cv::waitKey(0); + } + /******************************************************************* + * GAUSSIAN BLUR + *******************************************************************/ unsigned char* gaussianInput; cudaMalloc(&gaussianInput, bytes); cudaMemcpy(gaussianInput, grayscaleOutput, bytes, cudaMemcpyDeviceToDevice); - - // GAUSSIAN - allocate memory unsigned char* gaussianOutput; cudaMalloc(&gaussianOutput, bytes); - - // GAUSSIAN - copy kernel values to global memory int hostGaussian[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; cudaMemcpyToSymbol(gaussian, hostGaussian, 9 * sizeof(int)); - - // GAUSSIAN - set up kernel call configuration const dim3 numBlocks(ceil(cols / BLOCK_SIZE), ceil(rows / BLOCK_SIZE), 1); const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1); - gaussianKernel << < numBlocks, threadsPerBlock >> > (gaussianInput, gaussianOutput, cols, rows); - // GAUSSIAN - copy device output to host - cv::Mat gaussian = cv::Mat(height, width, CV_8UC1); + gaussianKernel << < numBlocks, threadsPerBlock >> > (gaussianInput, + gaussianOutput, cols, rows); cudaDeviceSynchronize(); - cudaMemcpy(gaussian.ptr(), gaussianOutput, bytes, cudaMemcpyDeviceToHost); - cudaFree(grayscaleOutput); - cv::imshow("OPTIMIZED Gaussian", gaussian); - cv::waitKey(0); - + cudaFree(gaussianInput); + if (debug) + { + cv::Mat gaussian = cv::Mat(height, width, CV_8UC1); + //cudaDeviceSynchronize(); + cudaMemcpy(gaussian.ptr(), gaussianOutput, bytes, cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED Gaussian", gaussian); + cv::waitKey(0); + } + + /******************************************************************* + * SOBEL OPERATOR + *******************************************************************/ + unsigned char* sobelInput; + cudaMalloc(&sobelInput, bytes); + cudaMemcpy(sobelInput, gaussianOutput, bytes, cudaMemcpyDeviceToDevice); unsigned char* sobelOutput; - cudaMalloc((void**)&sobelOutput, bytes); - float* angles; - cudaMalloc((void **) &angles, rows * cols * sizeof(float)); + cudaMalloc(&sobelOutput, bytes); + float* sobelAngles; + cudaMalloc(&sobelAngles, rows * cols * sizeof(float)); int h_sobel_x[9] = { 1, 0, -1, 2, 0, -2, 1, 0, -1 }; int h_sobel_y[9] = { 1, 2, 1, 0, 0, 0, -1, -2, -1 }; cudaMemcpyToSymbol(sobel_x, h_sobel_x, 9 * sizeof(int)); - cudaMemcpyToSymbol(sobel_y, h_sobel_y, 9 * sizeof(int)); - - unsigned char* sobelInput; - cudaMalloc(&sobelInput, bytes); - cudaMemcpy(sobelInput, gaussianOutput, bytes, cudaMemcpyDeviceToDevice); - sobelKernel << > > (sobelInput, sobelOutput, angles, cols, rows); + cudaMemcpyToSymbol(sobel_y, h_sobel_y, 9 * sizeof(int)); + sobelKernel << > > (sobelInput, sobelOutput, + sobelAngles, cols, rows); cudaDeviceSynchronize(); + cudaFree(gaussianOutput); + cudaFree(sobelInput); + if (debug) + { + cv::Mat sobel = cv::Mat(height, width, CV_8UC1); + //cudaDeviceSynchronize(); + cudaMemcpy(sobel.ptr(), sobelOutput, bytes, cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED Sobel", sobel); + cv::waitKey(0); + } + + /******************************************************************* + * NON-MAXIMA SUPPRESSION + *******************************************************************/ unsigned char* nmsInput; cudaMalloc(&nmsInput, bytes); cudaMemcpy(nmsInput, sobelOutput, bytes, cudaMemcpyDeviceToDevice); unsigned char* nmsOutput; - cudaMalloc((void**)&nmsOutput, bytes); - nonMaximaSuppressionKernel << > > (nmsInput, nmsOutput, angles, cols, rows); + cudaMalloc(&nmsOutput, bytes); + cudaMemcpy(nmsOutput, nmsInput, bytes, cudaMemcpyDeviceToDevice); + float* nmsAngles; + cudaMalloc(&nmsAngles, rows * cols * sizeof(float)); + cudaMemcpy(nmsAngles, sobelAngles, rows * cols * sizeof(float), + cudaMemcpyDeviceToDevice); + nonMaximaSuppressionKernel << > > (nmsInput, + nmsOutput, nmsAngles, cols, rows); cudaDeviceSynchronize(); + cudaFree(sobelOutput); + cudaFree(sobelAngles); + cudaFree(nmsInput); + if (debug) + { + cv::Mat nms = cv::Mat(height, width, CV_8UC1); + cudaMemcpy(nms.ptr(), nmsOutput, bytes, cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED nms", nms); + cv::waitKey(0); + } + /******************************************************************* + * HYSTERESIS THESHOLD - STAGE 1 + *******************************************************************/ unsigned char* thresholdInput; cudaMalloc(&thresholdInput, bytes); cudaMemcpy(thresholdInput, nmsOutput, bytes, cudaMemcpyDeviceToDevice); unsigned char* thresholdOutput; - cudaMalloc((void**)&thresholdOutput, bytes); - thresholdingKernel << > > (thresholdInput, thresholdOutput, cols, rows); + cudaMalloc(&thresholdOutput, bytes); + thresholdingKernel << > > (thresholdInput, + thresholdOutput, cols, rows); cudaDeviceSynchronize(); + cudaFree(nmsOutput); + cudaFree(nmsAngles); + cudaFree(thresholdInput); + if (debug) + { + cv::Mat threshold = cv::Mat(height, width, CV_8UC1); + cudaMemcpy(threshold.ptr(), thresholdOutput, bytes, cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED threshold", threshold); + cv::waitKey(0); + } + + /******************************************************************* + * HYSTERESIS THESHOLD - STAGE 2 + *******************************************************************/ unsigned char* hysteresisInput; cudaMalloc(&hysteresisInput, bytes); cudaMemcpy(hysteresisInput, thresholdOutput, bytes, cudaMemcpyDeviceToDevice); unsigned char* hysteresisOutput; - cudaMalloc((void**)&hysteresisOutput, bytes); - hysteresisKernel << > > (hysteresisInput, hysteresisOutput, cols, rows); + cudaMalloc(&hysteresisOutput, bytes); + cudaMemcpy(hysteresisOutput, hysteresisInput, bytes, cudaMemcpyDeviceToDevice); + hysteresisKernel << > > (hysteresisInput, + hysteresisOutput, cols, rows); cudaDeviceSynchronize(); - - //cudaDeviceSynchronize(); + cudaFree(thresholdOutput); + cudaFree(hysteresisInput); + if (debug) + { + cv::Mat hysteresis = cv::Mat(height, width, CV_8UC1); + cudaMemcpy(hysteresis.ptr(), hysteresisOutput, bytes, + cudaMemcpyDeviceToHost); + cv::imshow("OPTIMIZED hysteresis", hysteresis); + cv::waitKey(0); + } cudaMemcpy(output.ptr(), hysteresisOutput, bytes, cudaMemcpyDeviceToHost); - - cudaFree(deviceInput); - - cudaFree(gaussianOutput); - cudaFree(sobelOutput); - cudaFree(angles); - cudaFree(nmsOutput); - cudaFree(thresholdOutput); cudaFree(hysteresisOutput); - + cudaThreadExit(); return output; } + + +// This method represents our GPU accelerated canny edge detection +// implementation. It handles calling the different methods for the various +// steps that make up that process, and showing intermediate images if in demo cv::Mat gpuCanny(const cv::Mat &frame, bool demo) { cv::Mat image = frame.clone(); const int rows = image.rows; @@ -716,56 +837,48 @@ cv::Mat gpuCanny(const cv::Mat &frame, bool demo) { // convert the image to grayscale cv::Mat grayscale = cv::Mat(rows, cols, CV_8UC1); grayscaleCuda(image, grayscale); - // VISUAL DEBUG: compare our implementation with openCV implementation - /* - cv::Mat opencv_grayscale = cv::Mat(rows, cols, CV_8UC1); - cvtColor(image, opencv_grayscale, cv::COLOR_RGB2GRAY); - imshow("Grayscale Image", grayscale); - cv::waitKey(0); - imshow("openCV Grayscale Image", opencv_grayscale); - cv::waitKey(0); - */ + if (demo) + { + imshow("Grayscale Image", grayscale); + cv::waitKey(0); + } // apply the Gaussian filter cv::Mat blurred = cv::Mat(rows, cols, CV_8UC1); gaussianCuda(grayscale, blurred); - // VISUAL DEBUG: compare our implementation with openCV implementation - /* - cv::Mat opencv_blurred = cv::Mat(rows, cols, CV_8UC1); - cv::GaussianBlur(grayscale, opencv_blurred, cv::Size(3, 3), 0); - imshow("Blurred Image", blurred); - cv::waitKey(0); - imshow("openCV Blurred Image", opencv_blurred); - cv::waitKey(0); - */ + if (demo) + { + imshow("Blurred Image", blurred); + cv::waitKey(0); + } // apply the Sobel operator cv::Mat sobel = cv::Mat(rows, cols, CV_8UC1); float* angles = (float*)malloc(rows * cols * sizeof(float)); sobelCuda(blurred, sobel, angles); - //imshow("Intensity Gradient Image", sobel); - //cv::waitKey(); - // VISUAL DEBUG: compare our implementation with openCV implementation - /* - cv::Mat opencv_sobel = cv::Mat(rows, cols, CV_8UC1); - cv::Sobel(blurred, opencv_sobel, CV_8UC1, 1, 1); - imshow("Intensity Gradient Image", sobel); - cv::waitKey(); - imshow("openCV Intensity Gradient", opencv_sobel); - cv::waitKey(); - */ + if (demo) + { + imshow("Intensity Gradient Image", sobel); + cv::waitKey(0); + } // apply non-maxima suppression cv::Mat nms = cv::Mat(rows, cols, CV_8UC1); nonMaximaSuppressionCuda(sobel, nms, angles); - //imshow("Non-Maxima Suppression Image", nms); - - + if (demo) + { + imshow("Non-Maxima Suppression Image", nms); + cv::waitKey(0); + } + // perform hysteresis thresholding - stage 1 cv::Mat threshold = cv::Mat(rows, cols, CV_8UC1); thresholdingCuda(nms, threshold); - //imshow("Hysteresis Thresholded Image - Stage 1", threshold); - //cv::waitKey(); + if (demo) + { + imshow("Hysteresis Thresholded Image - Stage 1", threshold); + cv::waitKey(0); + } // perform hysteresis thresholding - stage 2 cv::Mat hysteresis = cv::Mat(rows, cols, CV_8UC1); @@ -773,40 +886,13 @@ cv::Mat gpuCanny(const cv::Mat &frame, bool demo) { if (demo) { imshow("Hysteresis Thresholded Image - Stage 2", hysteresis); - cv::waitKey(); + cv::waitKey(0); } - /* - cv::Mat canny = cv::Mat(rows, cols, CV_8UC1); - canny = opencvCanny(frame); - imshow("openCV Canny", canny); - cv::waitKey(); - */ return hysteresis; } -std::string type2str(int type) { - std::string r; - - uchar depth = type & CV_MAT_DEPTH_MASK; - uchar chans = 1 + (type >> CV_CN_SHIFT); - - switch (depth) { - case CV_8U: r = "8U"; break; - case CV_8S: r = "8S"; break; - case CV_16U: r = "16U"; break; - case CV_16S: r = "16S"; break; - case CV_32S: r = "32S"; break; - case CV_32F: r = "32F"; break; - case CV_64F: r = "64F"; break; - default: r = "User"; break; - } - - r += "C"; - r += (chans + '0'); - return r; -} // COMMAND LINE ARGUMENTS // argv[0] = program name @@ -832,76 +918,78 @@ int main(int argc, char** argv) } else { - std::cerr << "Invalid command line arguments!1" << std::endl; + std::cerr << "Invalid command line arguments!" << std::endl; return -1; } bool demo = false; - if (argc < 3) + bool debug = false; + if (argc > 3) { std::string demo_arg = argv[3]; if (demo_arg == "demo") { demo = true; } + else if (demo_arg == "debug") + { + debug = true; + } else { - std::cerr << "Invalid command line arguments!2" << std::endl; + std::cerr << "Invalid command line arguments!" << std::endl; return -1; } } - + // extract the video frames into a vector std::vector framesOutput; extractFrames(videoFilePath, framesOutput); + // start timing for the total run time including hough auto totalStart = std::chrono::high_resolution_clock::now(); - //std::chrono::high_resolution_clock::duration gpuTime = std::chrono::high_resolution_clock::rep(std::chrono::duration_values::zero); - //auto gpuTime = std::chrono::high_resolution_clock::duration::zero; - //auto houghTime = std::chrono::high_resolution_clock::duration::zero; std::chrono::milliseconds opencvTime(0); std::chrono::milliseconds gpuTime(0); std::chrono::milliseconds houghTime(0); - //std::chrono::high_resolution_clock::duration houghTime{}; - // + // loop through each fram for (int i = 0; i < framesOutput.size(); i++) { + // create Mat to hold the edges from canny edge detection cv::Mat edges; - - std::string type = type2str(framesOutput[i].type()); int size = framesOutput.size(); + // This section is for when using the opencvCanny() implementation // path (non-GPU accelarated) if (!gpuAccelerated) { - // create Mat to hold the edges from canny edge detection + // start timing for OpenCV canny edge detection auto opencvFrameStart = std::chrono::high_resolution_clock::now(); edges = opencvCanny(framesOutput[i]); auto opencvFrameEnd = std::chrono::high_resolution_clock::now(); - auto opencvFrameMs = std::chrono::duration_cast(opencvFrameEnd - opencvFrameStart); + auto opencvFrameMs = std::chrono::duration_cast + (opencvFrameEnd - opencvFrameStart); opencvTime += opencvFrameMs; - //imshow("Edge Detected Frame", edges); - //cv::waitKey(0); } // This section is for when using our own GPU accelerated path else { - // create Mat to hold the edges from canny edge detection + // start timing for GPU edge detecton auto gpuFrameStart = std::chrono::high_resolution_clock::now(); if (demo == true) { edges = gpuCanny(framesOutput[i], demo); } - else + else if (debug == true) { - edges = gpuOptimized(framesOutput[i]); - imshow("Edge Detected Frame", edges); - cv::waitKey(0); + + edges = gpuOptimized(framesOutput[i], debug); + } + else { + edges = gpuCanny(framesOutput[i], demo); } auto gpuFrameEnd = std::chrono::high_resolution_clock::now(); - auto gpuFrameMs = std::chrono::duration_cast(gpuFrameEnd - gpuFrameStart); + auto gpuFrameMs = std::chrono::duration_cast + (gpuFrameEnd - gpuFrameStart); gpuTime += gpuFrameMs; - // imshow("Edge Detected Frame", edges); - // cv::waitKey(0); } // perform hough transform, storing lines detected in houghLines vector @@ -909,7 +997,8 @@ int main(int argc, char** argv) auto houghStart = std::chrono::high_resolution_clock::now(); houghTransform(edges, houghLines); auto houghEnd = std::chrono::high_resolution_clock::now(); - houghTime += std::chrono::duration_cast(houghEnd - houghStart); + houghTime += std::chrono::duration_cast + (houghEnd - houghStart); if (demo) { @@ -917,21 +1006,28 @@ int main(int argc, char** argv) cv::waitKey(0); } } - + + // end the timing for total run time including hough (for this frame) auto totalEnd = std::chrono::high_resolution_clock::now(); //std::chrono::duration totalTime = totalEnd - totalStart; - auto totalMilliseconds = std::chrono::duration_cast(totalEnd - totalStart); - std::cout << "Total execution time: " << totalMilliseconds.count() << " milliseconds" << std::endl; + auto totalMilliseconds = std::chrono::duration_cast + (totalEnd - totalStart); + std::cout << "Total execution time: " << totalMilliseconds.count() << + " milliseconds" << std::endl; if (gpuAccelerated) { - std::cout << "GPU Canny execution time (CUDA Kernels): " << gpuTime.count() << " milliseconds" << std::endl; - std::cout << "CPU Hough transform execution time: " << houghTime.count() << "milliseconds" << std::endl; + std::cout << "GPU Canny execution time (CUDA Kernels): " << + gpuTime.count() << " milliseconds" << std::endl; + std::cout << "CPU Hough transform execution time: " << houghTime.count() + << "milliseconds" << std::endl; } if (!gpuAccelerated) { - std::cout << "CPU openCV::Canny() execution time: " << opencvTime.count() << " milliseconds" << std::endl; - std::cout << "CPU Hough transform execution time: " << houghTime.count() << "milliseconds" << std::endl; + std::cout << "CPU openCV::Canny() execution time: " << + opencvTime.count() << " milliseconds" << std::endl; + std::cout << "CPU Hough transform execution time: " << houghTime.count() + << " milliseconds" << std::endl; } cv::destroyAllWindows(); return 0; diff --git a/testvideo.mp4 b/testvideo.mp4 index 93942c9..acbc1ee 100644 Binary files a/testvideo.mp4 and b/testvideo.mp4 differ diff --git a/testvideob.mp4 b/testvideob.mp4 deleted file mode 100644 index acbc1ee..0000000 Binary files a/testvideob.mp4 and /dev/null differ diff --git a/testvideoc.mp4 b/testvideoc.mp4 deleted file mode 100644 index 1cdd10b..0000000 Binary files a/testvideoc.mp4 and /dev/null differ