紋理一詞來源于GPU圖形世界,GPU通用并行計算“盜用”了紋理一詞,定義了一個紋理內存的概念。紋理內存緩存在設備上,在某些情況下能減少對內存的請求并降低內存帶寬的使用,是專門為那些在內存訪問模式中存在大量空間局部性的圖形應用而設計;空間局部性意味著一個線程讀取的位置可能與鄰近線程讀取的位置“非常接近”。對于GPU內核而言,紋理內存是只讀內存,并且只有通過特殊的紋理API才能對其訪問。
紋理內存分為一維紋理內存和二維紋理內存,理解紋理內存最好的方式是丟掉“紋理”兩個字,紋理內存本質上是一塊內存,是GPU在特定應用中對一維、二維變量的特殊聲明定義以及特殊使用,這種特殊使用能夠減少內存流量,提升運算性能。
紋理變量(引用)必須聲明為文件作用域內的全局變量,這里先探討一下一維紋理內存的使用方法。一維紋理內存的關鍵操作如下:
1、用texture<類型>類型聲明。如聲明一個unsigned char 型的一維紋理tex1,格式為:
texture<unsigned char,1,cudaReadModeElementType> tex1;
2、通過cudaBindTexture()綁定到紋理內存中,并關聯到對應的數據上。如將unsigned char類型的dev_A綁定到一維紋理tex1上,格式為:
cudaBindTexture(0,tex1,dev_A);
注意:綁定本身并不拷貝數據——數據仍然位于之前通過cudaMalloc()分配、并用cudaMemcpy()填充的設備內存中,綁定只是讓紋理引用指向這塊內存,之后對它的讀取會經過紋理緩存。由于紋理引用是文件作用域的全局變量,在核函數中可以直接通過它訪問數據,不再需要把數據指針作為參數額外傳入。
3、 通過tex1Dfetch()來讀取紋理內存中的數據。紋理內存是一種特殊的內存,需要使用特定的紋理API來訪問其中的數據。如訪問tex1數組的第3個元素,格式為:
tex1Dfetch(tex1,2);
4、 通過cudaUnbindTexture()取消綁定紋理內存。紋理內存使用完之后需要取消綁定;注意解除綁定只是斷開紋理引用與設備內存的關聯,設備內存本身仍需通過cudaFree()釋放。如解除紋理tex1的綁定,格式為:
cudaUnbindTexture(tex1);
考慮一個簡單的應用,把一個長度是100的向量A中的數據拷貝到一個向量B中,使用普通CPU編程實現如下:
#include <iostream>using namespace std;#define _length 100//CPU函數實現復制一個數組void Copy_CPU(unsigned int * listSource, unsigned int * listTarget, int length){ for (int i = 0; i < length; i++) { listTarget[i] = listSource[i]; }}int main(){ unsigned int * listSource = new unsigned int[_length]; unsigned int * listTarget = new unsigned int[_length]; //賦值 for (int i = 0; i < _length; i++) { listSource[i] = i; } //調用CPU復制函數 Copy_CPU(listSource, listTarget, _length); cout << "原始數據: "; for (int i = 0; i < _length; i++) { cout << listSource[i] << " "; } cout << endl << endl << "通過CPU拷貝的數據: "; for (int i = 0; i < _length; i++) { cout << listTarget[i] << " "; } getchar();}運行結果:
使用GPU編程,普通變量編程實現:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#define _length 100

using namespace std;

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Host wrapper that copies listSource into listTarget through the GPU.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

/**
 * Demo driver: fills a source vector with 0.._length-1, copies it through the
 * GPU with Copy_GPU, prints both vectors and waits for a key press.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // std::vector owns the host buffers so they are released automatically.
    vector<unsigned int> listSource(_length);
    vector<unsigned int> listTarget(_length);

    // Initialise the source data
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel internally
    Copy_GPU(listSource.data(), listTarget.data(), _length);

    cout << "原始數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU普通內存拷貝的數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    getchar();
    return 0;
}

/**
 * Device kernel: each thread copies one element from listSource to listTarget.
 *
 * Launch layout: 1D grid of 1D blocks; any grid that provides at least `size`
 * threads works because threads past the end are masked out by the guard.
 * Both pointers must be device pointers holding at least `size` elements.
 */
__global__ void Blending_Texture(unsigned int* listSource, unsigned int* listTarget, int size)
{
    // Flat global index of this thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
        listTarget[index] = listSource[index];
}

/**
 * Copy `length` unsigned ints from host buffer listSource to host buffer
 * listTarget by staging them through device memory and a copy kernel.
 *
 * Null buffers or a non-positive length are a no-op. Any CUDA failure aborts
 * the program via CUDA_CHECK.
 */
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Host -> device
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Integer ceil-division so a partial last block is still launched.
    // (The original ceil(_length / 10) truncated before ceil() was applied
    // and ignored the `length` argument.)
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Source, dev_Target, length);
    CUDA_CHECK(cudaGetLastError());

    // Device -> host; this blocking copy also waits for the kernel to finish
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Release device memory
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
// 運行結果 (run output):
使用GPU編程,一維紋理變量編程實現:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#define _length 100

using namespace std;

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// File-scope 1D texture reference; Copy_GPU binds it to the source buffer.
// NOTE: the texture *reference* API used here (texture<>, cudaBindTexture,
// cudaUnbindTexture) is deprecated and was removed in CUDA 12; building this
// file requires an older toolkit, newer code should use texture objects.
texture<unsigned int, 1, cudaReadModeElementType> rT1;

// Host wrapper that copies listSource into listTarget through texture memory.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

/**
 * Demo driver: fills a source vector with 0.._length-1, copies it through the
 * GPU texture path with Copy_GPU, prints both vectors and waits for a key.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // std::vector owns the host buffers so they are released automatically.
    vector<unsigned int> listSource(_length);
    vector<unsigned int> listTarget(_length);

    // Initialise the source data
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel internally
    Copy_GPU(listSource.data(), listTarget.data(), _length);

    cout << "原始數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU紋理內存拷貝的數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    getchar();
    return 0;
}

/**
 * Device kernel: each thread fetches one element through texture rT1 and
 * stores it in listTarget.
 *
 * Launch layout: 1D grid of 1D blocks providing at least `size` threads;
 * extra threads are masked out by the guard. rT1 must already be bound to a
 * device buffer of at least `size` elements.
 */
__global__ void Blending_Texture(unsigned int* listTarget, int size)
{
    // Flat global index of this thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Read via the texture fetch API instead of a plain global load
    if (index < size)
        listTarget[index] = tex1Dfetch(rT1, index);
}

/**
 * Copy `length` unsigned ints from host buffer listSource to host buffer
 * listTarget by uploading them, binding the device buffer to texture rT1 and
 * running a kernel that reads through the texture.
 *
 * Null buffers or a non-positive length are a no-op. Any CUDA failure aborts
 * the program via CUDA_CHECK.
 */
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Host -> device
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Associate the texture reference with the uploaded data; passing the
    // byte size limits the bound range to the actual allocation.
    CUDA_CHECK(cudaBindTexture(nullptr, rT1, dev_Source, data_size));

    // Integer ceil-division so a partial last block is still launched.
    // (The original ceil(_length / 10) truncated before ceil() was applied
    // and ignored the `length` argument.)
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Target, length);
    CUDA_CHECK(cudaGetLastError());

    // Device -> host; this blocking copy also waits for the kernel to finish
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Unbind before the underlying buffer is freed
    CUDA_CHECK(cudaUnbindTexture(rT1));

    // Release device memory
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
// 運行結果 (run output):
再舉一個使用CUDA+OpenCv編程,實現復制一幅圖像的例子:
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <highgui/highgui.hpp>
#include <imgproc/imgproc.hpp>

#define DIM 512 // image width and height in pixels

using namespace std;
using namespace cv;

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// File-scope 1D texture reference; main binds it to the source image bytes.
// NOTE: the texture *reference* API used here (texture<>, cudaBindTexture,
// cudaUnbindTexture) is deprecated and was removed in CUDA 12; building this
// file requires an older toolkit, newer code should use texture objects.
texture<unsigned char, 1, cudaReadModeElementType> rT1;

/**
 * Device kernel: each thread copies the three channel bytes of one pixel from
 * texture rT1 into imageTarget.
 *
 * Launch layout: 2D grid of 2D blocks covering at least DIM x DIM threads;
 * threads outside the image are masked out by the guard. imageTarget must be
 * a device buffer of at least 3 * DIM * DIM bytes and rT1 must be bound to a
 * source buffer of the same size.
 */
__global__ void Kernel_Copy(unsigned char* imageTarget)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;

    // Row-major pixel index; each pixel occupies 3 consecutive bytes
    int offset = x + y * DIM;

    imageTarget[offset * 3 + 2] = tex1Dfetch(rT1, offset * 3 + 2);
    imageTarget[offset * 3 + 1] = tex1Dfetch(rT1, offset * 3 + 1);
    imageTarget[offset * 3 + 0] = tex1Dfetch(rT1, offset * 3 + 0);
}

/**
 * Demo driver: loads an image, resizes it to DIM x DIM, copies it on the GPU
 * through a 1D texture and displays the copy.
 *
 * Returns 1 if the input image cannot be loaded, 0 otherwise. Any CUDA
 * failure aborts the program via CUDA_CHECK.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    Mat image = imread("D://lena.jpg");
    if (image.empty())
    {
        // imread signals failure with an empty Mat; resize() would throw on it
        cerr << "Failed to load image: D://lena.jpg" << endl;
        return 1;
    }

    Mat imageSource;
    resize(image, imageSource, Size(DIM, DIM)); // scale to the working size
    Mat imageTarget = Mat(Size(DIM, DIM), CV_8UC3, Scalar::all(0));

    // 3 bytes per pixel
    const size_t imageBytes = static_cast<size_t>(3) * imageSource.rows * imageSource.cols;

    // Allocate device buffers
    unsigned char *dev_imageSource = nullptr;
    unsigned char *dev_imageTarget = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_imageSource, imageBytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_imageTarget, imageBytes));

    // Upload the source and the (zero-filled) target image
    CUDA_CHECK(cudaMemcpy(dev_imageSource, imageSource.data, imageBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_imageTarget, imageTarget.data, imageBytes, cudaMemcpyHostToDevice));

    // Associate the texture reference with the uploaded source bytes
    CUDA_CHECK(cudaBindTexture(nullptr, rT1, dev_imageSource, imageBytes));

    // One thread per pixel; ceil-division keeps the grid correct for any DIM
    dim3 threads(16, 16);
    dim3 grids((DIM + threads.x - 1) / threads.x, (DIM + threads.y - 1) / threads.y);
    Kernel_Copy<<<grids, threads>>>(dev_imageTarget);
    CUDA_CHECK(cudaGetLastError());

    // Device -> host; this blocking copy also waits for the kernel to finish
    CUDA_CHECK(cudaMemcpy(imageTarget.data, dev_imageTarget, imageBytes, cudaMemcpyDeviceToHost));

    imshow("CUDA紋理內存使用示例", imageTarget);
    waitKey();

    // Unbind the texture, then free BOTH device buffers (the original freed
    // dev_imageSource twice and leaked dev_imageTarget).
    CUDA_CHECK(cudaUnbindTexture(rT1));
    CUDA_CHECK(cudaFree(dev_imageSource));
    CUDA_CHECK(cudaFree(dev_imageTarget));
    return 0;
}
// 運行結果 (run output):
新聞熱點
疑難解答